//===================================================================================
// Copyright (c) 2021-2024    Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//==================================================================================
//----------------------------------------------------------------------------------
// File: BC7Encode.hlsl
//
// The Compute Shader for BC7 Encoder
//
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//----------------------------------------------------------------------------------

#ifdef ASPM_GPU
#pragma warning(disable : 3078)  // "loop control variable conflicts with a previous declaration in the outer scope"
#else                            // using CPU
#include "common_def.h"
#include "bcn_common_api.h"
#include <algorithm>
#endif

// TryMode456CS
#define ENABLE_MODE4
#define ENABLE_MODE5
#define ENABLE_MODE6

// TryMode02CS
#define ENABLE_MODE0
#define ENABLE_MODE2

// TryMode137CS
#define ENABLE_MODE1
#define ENABLE_MODE3
#define ENABLE_MODE7

//#define ENABLE_CMP_MODE0
//#define ENABLE_CMP_MODE1
//#define ENABLE_CMP_MODE2
//#define ENABLE_CMP_MODE3
//#define ENABLE_CMP_MODE4
//#define ENABLE_CMP_MODE5
#define ENABLE_CMP_MODE6
//#define ENABLE_CMP_MODE7

#define ENABLE_CMP_API
#define USE_NEW_SP_ERR_IDX
#define ENABLE_CMP_REFINE_MODE6_API  // API to improve mode 6 quality
#define MAX_TRY_SHAKER 1             // used in cmp_ep_shaker

//====================================================================================
//                          HLSL Host Simulation
//====================================================================================
// Simulating the HLSL compute code on a CPU host must run single threaded.
// On the CPU the code simulates a single compute unit as used by the CMP DXC host.

// Enable SIMULATE_GPU to run simulation in CPU using HPC in CMP GUI or CMP CLI
// Note: some bcn_encode_kernel.cpp files have specific code you simulate with, enable
// the define USE_NEW_SINGLE_HEADER_INTERFACES and pick the external or local codec
// to run with.

//===========================================================================
// Prototype to debug a simple simulation of the shader using shared global data,
// run as a single thread on the CPU
// #define SIMULATE_GPU
//===========================================================================

#if !defined(ASPM_GPU)

#define THREAD_GROUP_SIZE 64
#define BLOCK_SIZE_X 4
#define BLOCK_SIZE_Y 4
#define MAX_UINT 0xFFFFFFFF
#define MIN_UINT 0x00000000

// Source Texture to process
// Texture2D g_Input;
// Normalized 0..1
struct Texture2D
{
    // 4x4 block of texels, stored row-major, channels normalized to 0..1.
    CGU_Vec4f Texture[16];

    // Flattens a 2D texel coordinate into the 16-entry block; the mask wraps
    // any out-of-range coordinate back into the block.
    CGU_Vec4f Load(CGU_Vec3ui index)
    {
        return Texture[(index.x + (index.y * 4)) & 0x0F];
    };

    // Same as above; the extra level-of-detail argument is accepted for HLSL
    // signature compatibility but ignored on the CPU.
    CGU_Vec4f Load(CGU_Vec3ui index, CGU_UINT32 z)
    {
        CMP_UNUSED(z);
        return Texture[(index.x + (index.y * 4)) & 0x0F];
    };

    // Unsigned-integer variant; z/w of the lookup index are ignored.
    CGU_Vec4ui Load(CGU_Vec4ui index)
    {
        CGU_INT    texel = (index.x + (index.y * 4)) & 0x0F;
        CGU_Vec4ui res;
        // Implicit float -> uint conversion (truncation), as in the HLSL path.
        res.x = Texture[texel].x;
        res.y = Texture[texel].y;
        res.z = Texture[texel].z;
        res.w = Texture[texel].w;
        return res;
    };
};

// matches GPU struct in HLSL
struct BufferShared
{
    CGU_Vec4ui pixel;
    CGU_UINT32 error;
    CGU_UINT32 mode;
    CGU_UINT32 partition;
    CGU_UINT32 index_selector;
    CGU_UINT32 rotation;
    CGU_UINT32 pbit;
    CGU_Vec4ui endPoint_low;
    CGU_Vec4ui endPoint_high;
    CGU_Vec4ui endPoint_low_quantized;
    CGU_Vec4ui endPoint_high_quantized;
    CGU_UINT32 colorindex;
    CGU_UINT32 alphaindex;
};

// I/O record exchanged between the simulated shader passes via
// g_InBuff / g_OutBuff1: the best encoding candidate found so far.
struct SharedIOData
{
    CGU_UINT32 error;           // error of the best candidate so far
    CGU_UINT32 mode;            // BC7 mode of the candidate
    CGU_UINT32 index_selector;  // index-set selection for the candidate
    CGU_UINT32 rotation;        // channel rotation for the candidate
    CGU_UINT32 partition;       // partition shape index
    CGU_Vec4ui data2;           // additional packed candidate data
};

// Thread-group shared memory and the input texture for the single simulated
// compute unit (one 4x4 block).
CMP_STATIC BufferShared shared_temp[THREAD_GROUP_SIZE];
CMP_STATIC Texture2D    g_Input;

// cbuffer input: On cpu will use 1 block
CMP_STATIC CGU_UINT32 g_tex_width;  // Not used in HLSLHost simulation code
CMP_STATIC CGU_UINT32 g_num_block_x = 1;
CMP_STATIC CGU_UINT32 g_format;  // Not used in HLSLHost simulation code
CMP_STATIC CGU_UINT32 g_mode_id        = 1;  // BC7 mode being tried; set per pass by HLSLHost
CMP_STATIC CGU_UINT32 g_start_block_id = 0;
CMP_STATIC CGU_UINT32 g_num_total_blocks;
CMP_STATIC CGU_FLOAT  g_alpha_weight = 1.0f;  // alpha weighting used by the encoder
CMP_STATIC CGU_FLOAT  g_quality      = 1.0f;  // encoder quality setting

// Inter-pass buffers: each pass reads g_InBuff; candidate results go to
// g_OutBuff1 and the final compressed block data to g_OutBuff.
CMP_STATIC SharedIOData g_InBuff[THREAD_GROUP_SIZE];
CMP_STATIC CGU_Vec4ui   g_OutBuff[THREAD_GROUP_SIZE];   // Used by EncodeBlocks & TryMode...
CMP_STATIC SharedIOData g_OutBuff1[THREAD_GROUP_SIZE];  // Used by TryMode...

// Forward definitions of the simulated compute-shader entry points
void TryMode456CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void TryMode137CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void TryMode02CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void EncodeBlocks(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);

// Simulate the three-pass BC7 compute-shader pipeline for a single 4x4 block
// on the CPU, single threaded.  image_src holds 16 texels with 0..255 channel
// values; the compressed result is written to the global g_OutBuff by
// EncodeBlocks.
//
// Fixes vs. original: removed the unused local imageBlock[16], the unused
// (and misspelled) SV_GrounThreadID local, and the commented-out code blocks.
CMP_STATIC void HLSLHost(CGU_Vec4f image_src[16])
{
    //====================================
    // Simulate a single block CS
    //====================================
    // Load the source block into the simulated input texture, converting
    // 0..255 channel values to the normalized 0..1 range the shaders expect.
    for (CGU_INT i = 0; i < 16; i++)
    {
        g_Input.Texture[i].x = image_src[i].x / 255.0f;
        g_Input.Texture[i].y = image_src[i].y / 255.0f;
        g_Input.Texture[i].z = image_src[i].z / 255.0f;
        g_Input.Texture[i].w = image_src[i].w / 255.0f;
    }

    // Init global buffers for first time use
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memset(&shared_temp[i], 0, sizeof(BufferShared));
        memset(&g_InBuff[i], 0, sizeof(SharedIOData));
        memset(&g_OutBuff1[i], 0, sizeof(SharedIOData));
    }

    // On the GPU this would be Dispatch(1..(n-1),1,1) where n = number of
    // (4x4) blocks in the image; the host simulates one block, so the group
    // id is always (0,0,0).
    CGU_Vec3ui SV_GroupID = {0, 0, 0};
    g_start_block_id      = 0;

    // First shader pass: the 16 threads of TryMode456CS for mode 6.
    // NOTE(review): iterating from the highest thread index down to 0 appears
    // to emulate the GPU barrier ordering (thread 0 consolidates results
    // after the others have run) -- preserve this order.
    g_mode_id = 6;
    for (CGU_INT SV_GroupIndex = 15; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        TryMode456CS(SV_GroupIndex, SV_GroupID);
    }

    // Feed the pass output back as input for the next pass
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memcpy(&g_InBuff[i], &g_OutBuff1[i], sizeof(SharedIOData));
    }

    // Second shader pass: the 64 threads of TryMode137CS for mode 1
    g_mode_id = 1;
    for (CGU_INT SV_GroupIndex = 63; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        TryMode137CS(SV_GroupIndex, SV_GroupID);
    }

    // Feed the pass output back as input for the final pass
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memcpy(&g_InBuff[i], &g_OutBuff1[i], sizeof(SharedIOData));
    }

    // Final shader pass: EncodeBlocks emits the compressed block to g_OutBuff
    for (CGU_INT SV_GroupIndex = 15; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        EncodeBlocks(SV_GroupIndex, SV_GroupID);
    }
}

#endif

#ifdef ENABLE_CMP_API

// Change this to CGU_Vec4ui par_vectors42_nd[4][2];
CMP_STATIC CMP_CONSTANT CGU_UINT32 par_vectors42_nd[4][2][4] = {
    // type = 2
    {{0, 0, 0, 0}, {0, 0, 0, 0}},  // 0  {0,0}
    {{0, 0, 0, 0}, {1, 1, 1, 1}},  // 1  {0,1}
    {{1, 1, 1, 1}, {0, 0, 0, 0}},  // 2  {1,0}
    {{1, 1, 1, 1}, {1, 1, 1, 1}}   // 3  {1,1}
};

// Channel component indices
#define COMP_RED 0
#define COMP_GREEN 1
#define COMP_BLUE 2
#define COMP_ALPHA 3

// Per-mode BC7 encoder settings; values live in g_modesettings below.
typedef struct
{
    CGU_UINT32 numPartitionModes;  // number of partition shapes the mode supports
    CGU_UINT32 maxSubSets;         // subsets per partition
    CGU_UINT32 channels3or4;       // 3 = RGB only, 4 = RGBA
    CGU_UINT32 bits;               // total endpoint bits for the mode
    CGU_UINT32 clusters;           // index clusters (equals 2^indexBits in the table)
    CGU_UINT32 componentBits;      // bits per endpoint component
    CGU_UINT32 partitionBits;      // bits used to encode the partition id
    CGU_UINT32 indexBits;          // bits per index
} MODESETTINGS;

// Settings for BC7 modes 0..7 (indexed by mode number).
// NOTE(review): modes 4 and 5 leave most fields zero -- they appear to be
// handled by dedicated code paths rather than the generic partition search;
// confirm against the mode 4/5 encoder.
CMP_STATIC CMP_CONSTANT MODESETTINGS g_modesettings[8] = {
    //   numPartitionModes,maxSubSets   channels3or4,   bits,   clusters,   componentBits,  partitionBits,  indexBits
    {16, 3, 3, 26, 8, 4, 4, 3},  // Mode 0
    {64, 2, 3, 37, 8, 6, 6, 3},  // Mode 1
    {64, 3, 3, 30, 4, 5, 6, 2},  // Mode 2
    {64, 2, 3, 44, 4, 7, 6, 2},  // Mode 3
    {0, 0, 0, 0, 0, 0, 0, 2},    // Mode 4
    {0, 0, 0, 0, 0, 0, 0, 2},    // Mode 5
    {0, 0, 4, 58, 16, 7, 0, 4},  // Mode 6
    {64, 2, 4, 42, 4, 5, 6, 2}   // Mode 7
};

#ifndef ASPM_HLSL  //=======================================================

// Per-texel subset membership masks for the 4x4 block partitions.
// Entries 0..63 are the 2-subset patterns: bit i of the low 16 bits set
// means texel i belongs to subset 1 (clear = subset 0).
// Entries 64..127 are the 3-subset patterns: additionally, bit i of the
// high 16 bits set means texel i belongs to subset 2 (the trailing
// "= 2222..." comments show the combined per-texel subset digits).
CMP_STATIC CMP_CONSTANT CGU_UINT32 subset_mask_table2[128] = {
    // 2 subset region patterns
    0x0000CCCCu,  // 0   1100 1100 1100 1100  (MSB..LSB)
    0x00008888u,  // 1   1000 1000 1000 1000
    0x0000EEEEu,  // 2   1110 1110 1110 1110
    0x0000ECC8u,  // 3   1110 1100 1100 1000
    0x0000C880u,  // 4   1100 1000 1000 0000
    0x0000FEECu,  // 5   1111 1110 1110 1100
    0x0000FEC8u,  // 6   1111 1110 1100 1000
    0x0000EC80u,  // 7   1110 1100 1000 0000
    0x0000C800u,  // 8   1100 1000 0000 0000
    0x0000FFECu,  // 9   1111 1111 1110 1100
    0x0000FE80u,  // 10  1111 1110 1000 0000
    0x0000E800u,  // 11  1110 1000 0000 0000
    0x0000FFE8u,  // 12  1111 1111 1110 1000
    0x0000FF00u,  // 13  1111 1111 0000 0000
    0x0000FFF0u,  // 14  1111 1111 1111 0000
    0x0000F000u,  // 15  1111 0000 0000 0000
    0x0000F710u,  // 16  1111 0111 0001 0000
    0x0000008Eu,  // 17  0000 0000 1000 1110
    0x00007100u,  // 18  0111 0001 0000 0000
    0x000008CEu,  // 19  0000 1000 1100 1110
    0x0000008Cu,  // 20  0000 0000 1000 1100
    0x00007310u,  // 21  0111 0011 0001 0000
    0x00003100u,  // 22  0011 0001 0000 0000
    0x00008CCEu,  // 23  1000 1100 1100 1110
    0x0000088Cu,  // 24  0000 1000 1000 1100
    0x00003110u,  // 25  0011 0001 0001 0000
    0x00006666u,  // 26  0110 0110 0110 0110
    0x0000366Cu,  // 27  0011 0110 0110 1100
    0x000017E8u,  // 28  0001 0111 1110 1000
    0x00000FF0u,  // 29  0000 1111 1111 0000
    0x0000718Eu,  // 30  0111 0001 1000 1110
    0x0000399Cu,  // 31  0011 1001 1001 1100
    0x0000AAAAu,  // 32  1010 1010 1010 1010
    0x0000F0F0u,  // 33  1111 0000 1111 0000
    0x00005A5Au,  // 34  0101 1010 0101 1010
    0x000033CCu,  // 35  0011 0011 1100 1100
    0x00003C3Cu,  // 36  0011 1100 0011 1100
    0x000055AAu,  // 37  0101 0101 1010 1010
    0x00009696u,  // 38  1001 0110 1001 0110
    0x0000A55Au,  // 39  1010 0101 0101 1010
    0x000073CEu,  // 40  0111 0011 1100 1110
    0x000013C8u,  // 41  0001 0011 1100 1000
    0x0000324Cu,  // 42  0011 0010 0100 1100
    0x00003BDCu,  // 43  0011 1011 1101 1100
    0x00006996u,  // 44  0110 1001 1001 0110
    0x0000C33Cu,  // 45  1100 0011 0011 1100
    0x00009966u,  // 46  1001 1001 0110 0110
    0x00000660u,  // 47  0000 0110 0110 0000
    0x00000272u,  // 48  0000 0010 0111 0010
    0x000004E4u,  // 49  0000 0100 1110 0100
    0x00004E40u,  // 50  0100 1110 0100 0000
    0x00002720u,  // 51  0010 0111 0010 0000
    0x0000C936u,  // 52  1100 1001 0011 0110
    0x0000936Cu,  // 53  1001 0011 0110 1100
    0x000039C6u,  // 54  0011 1001 1100 0110
    0x0000639Cu,  // 55  0110 0011 1001 1100
    0x00009336u,  // 56  1001 0011 0011 0110
    0x00009CC6u,  // 57  1001 1100 1100 0110
    0x0000817Eu,  // 58  1000 0001 0111 1110
    0x0000E718u,  // 59  1110 0111 0001 1000
    0x0000CCF0u,  // 60  1100 1100 1111 0000
    0x00000FCCu,  // 61  0000 1111 1100 1100
    0x00007744u,  // 62  0111 0111 0100 0100
    0x0000EE22u,  // 63  1110 1110 0010 0010

    // 3 Subset region patterns
    0xF60008CCu,  // 0    1111 0110 0000 0000 : 0000 1000 1100 1100 = 2222122011001100 (MSB...LSB)
    0x73008CC8u,  // 1    0111 0011 0000 0000 : 1000 1100 1100 1000 = 1222112211001000
    0x3310CC80u,  // 2    0011 0011 0001 0000 : 1100 1100 1000 0000 = 1122112210020000
    0x00CEEC00u,  // 3    0000 0000 1100 1110 : 1110 1100 0000 0000 = 1110110022002220
    0xCC003300u,  // 4    1100 1100 0000 0000 : 0011 0011 0000 0000 = 2211221100000000
    0xCC0000CCu,  // 5    1100 1100 0000 0000 : 0000 0000 1100 1100 = 2200220011001100
    0x00CCFF00u,  // 6    0000 0000 1100 1100 : 1111 1111 0000 0000 = 1111111122002200
    0x3300CCCCu,  // 7    0011 0011 0000 0000 : 1100 1100 1100 1100 = 1122112211001100
    0xF0000F00u,  // 8    1111 0000 0000 0000 : 0000 1111 0000 0000 = 2222111100000000
    0xF0000FF0u,  // 9    1111 0000 0000 0000 : 0000 1111 1111 0000 = 2222111111110000
    0xFF0000F0u,  // 10   1111 1111 0000 0000 : 0000 0000 1111 0000 = 2222222211110000
    0x88884444u,  // 11   1000 1000 1000 1000 : 0100 0100 0100 0100 = 2100210021002100
    0x88886666u,  // 12   1000 1000 1000 1000 : 0110 0110 0110 0110 = 2110211021102110
    0xCCCC2222u,  // 13   1100 1100 1100 1100 : 0010 0010 0010 0010 = 2210221022102210
    0xEC80136Cu,  // 14   1110 1100 1000 0000 : 0001 0011 0110 1100 = 2221221121101100
    0x7310008Cu,  // 15   0111 0011 0001 0000 : 0000 0000 1000 1100 = 0222002210021100
    0xC80036C8u,  // 16   1100 1000 0000 0000 : 0011 0110 1100 1000 = 2211211011001000
    0x310008CEu,  // 17   0011 0001 0000 0000 : 0000 1000 1100 1110 = 0022100211001110
    0xCCC03330u,  // 18   1100 1100 1100 0000 : 0011 0011 0011 0000 = 2211221122110000
    0x0CCCF000u,  // 19   0000 1100 1100 1100 : 1111 0000 0000 0000 = 1111220022002200
    0xEE0000EEu,  // 20   1110 1110 0000 0000 : 0000 0000 1110 1110 = 2220222011101110
    0x77008888u,  // 21   0111 0111 0000 0000 : 1000 1000 1000 1000 = 1222122210001000
    0xCC0022C0u,  // 22   1100 1100 0000 0000 : 0010 0010 1100 0000 = 2210221011000000
    0x33004430u,  // 23   0011 0011 0000 0000 : 0100 0100 0011 0000 = 0122012200110000
    0x00CC0C22u,  // 24   0000 0000 1100 1100 : 0000 1100 0010 0010 = 0000110022102210
    0xFC880344u,  // 25   1111 1100 1000 1000 : 0000 0011 0100 0100 = 2222221121002100
    0x06606996u,  // 26   0000 0110 0110 0000 : 0110 1001 1001 0110 = 0110122112210110
    0x66009960u,  // 27   0110 0110 0000 0000 : 1001 1001 0110 0000 = 1221122101100000
    0xC88C0330u,  // 28   1100 1000 1000 1100 : 0000 0011 0011 0000 = 2200201120112200
    0xF9000066u,  // 29   1111 1001 0000 0000 : 0000 0000 0110 0110 = 2222200201100110
    0x0CC0C22Cu,  // 30   0000 1100 1100 0000 : 1100 0010 0010 1100 = 1100221022101100
    0x73108C00u,  // 31   0111 0011 0001 0000 : 1000 1100 0000 0000 = 1222112200020000
    0xEC801300u,  // 32   1110 1100 1000 0000 : 0001 0011 0000 0000 = 2221221120000000
    0x08CEC400u,  // 33   0000 1000 1100 1110 : 1100 0100 0000 0000 = 1100210022002220
    0xEC80004Cu,  // 34   1110 1100 1000 0000 : 0000 0000 0100 1100 = 2220220021001100
    0x44442222u,  // 35   0100 0100 0100 0100 : 0010 0010 0010 0010 = 0210021002100210
    0x0F0000F0u,  // 36   0000 1111 0000 0000 : 0000 0000 1111 0000 = 0000222211110000
    0x49242492u,  // 37   0100 1001 0010 0100 : 0010 0100 1001 0010 = 0210210210210210
    0x42942942u,  // 38   0100 0010 1001 0100 : 0010 1001 0100 0010 = 0210102121020210
    0x0C30C30Cu,  // 39   0000 1100 0011 0000 : 1100 0011 0000 1100 = 1100221100221100
    0x03C0C03Cu,  // 40   0000 0011 1100 0000 : 1100 0000 0011 1100 = 1100002222111100
    0xFF0000AAu,  // 41   1111 1111 0000 0000 : 0000 0000 1010 1010 = 2222222210101010
    0x5500AA00u,  // 42   0101 0101 0000 0000 : 1010 1010 0000 0000 = 1212121200000000
    0xCCCC3030u,  // 43   1100 1100 1100 1100 : 0011 0000 0011 0000 = 2211220022112200
    0x0C0CC0C0u,  // 44   0000 1100 0000 1100 : 1100 0000 1100 0000 = 1100220011002200
    0x66669090u,  // 45   0110 0110 0110 0110 : 1001 0000 1001 0000 = 1221022012210220
    0x0FF0A00Au,  // 46   0000 1111 1111 0000 : 1010 0000 0000 1010 = 1010222222221010
    0x5550AAA0u,  // 47   0101 0101 0101 0000 : 1010 1010 1010 0000 = 1212121212120000
    0xF0000AAAu,  // 48   1111 0000 0000 0000 : 0000 1010 1010 1010 = 2222101010101010
    0x0E0EE0E0u,  // 49   0000 1110 0000 1110 : 1110 0000 1110 0000 = 1110222011102220
    0x88887070u,  // 50   1000 1000 1000 1000 : 0111 0000 0111 0000 = 2111200021112000
    0x99906660u,  // 51   1001 1001 1001 0000 : 0110 0110 0110 0000 = 2112211221120000
    0xE00E0EE0u,  // 52   1110 0000 0000 1110 : 0000 1110 1110 0000 = 2220111011102220
    0x88880770u,  // 53   1000 1000 1000 1000 : 0000 0111 0111 0000 = 2000211121112000
    0xF0000666u,  // 54   1111 0000 0000 0000 : 0000 0110 0110 0110 = 2222011001100110
    0x99006600u,  // 55   1001 1001 0000 0000 : 0110 0110 0000 0000 = 2112211200000000
    0xFF000066u,  // 56   1111 1111 0000 0000 : 0000 0000 0110 0110 = 2222222201100110
    0xC00C0CC0u,  // 57   1100 0000 0000 1100 : 0000 1100 1100 0000 = 2200110011002200
    0xCCCC0330u,  // 58   1100 1100 1100 1100 : 0000 0011 0011 0000 = 2200221122112200
    0x90006000u,  // 59   1001 0000 0000 0000 : 0110 0000 0000 0000 = 2112000000000000
    0x08088080u,  // 60   0000 1000 0000 1000 : 1000 0000 1000 0000 = 1000200010002000
    0xEEEE1010u,  // 61   1110 1110 1110 1110 : 0001 0000 0001 0000 = 2221222022212220
    0xFFF0000Au,  // 62   1111 1111 1111 0000 : 0000 0000 0000 1010 = 2222222222221010
    0x731008CEu,  // 63   0111 0011 0001 0000 : 0000 1000 1100 1110 = 0222102211021110
};

// Number of valid parity-vector candidates per type, indexed as
// [3-or-4-channel row][type].  The counts match the number of non-trivial
// leading entries in each cmp_par_vectors_nd sub-table.
// NOTE(review): semantics inferred from the companion table -- confirm
// against its usage in cmp_ep_shaker.
CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_npv_nd[2][8] = {
    {1, 2, 4, 8, 16, 32, 0, 0},  // 3 channel
    {1, 2, 4, 0, 0, 0, 0, 0}     // 4 channel
};

CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_par_vectors_nd[2][8][64][2][4] = {
    {
        // 3D
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 0}},
         {{1, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 0, 0}, {0, 1, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {0, 0, 0, 0}}, {{1, 0, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 0, 0}},
         {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 1, 0, 0}}, {{0, 1, 1, 0}, {1, 1, 0, 0}}, {{0, 0, 0, 0}, {1, 0, 1, 0}}, {{1, 1, 0, 0}, {1, 0, 1, 0}},
         {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {1, 0, 1, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 0}}, {{1, 1, 0, 0}, {0, 1, 1, 0}}, {{1, 0, 1, 0}, {0, 1, 1, 0}},
         {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {0, 0, 0, 0}}, {{1, 0, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 0, 0}},
         {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 1, 0, 0}}, {{0, 1, 1, 0}, {1, 1, 0, 0}}, {{0, 0, 0, 0}, {1, 0, 1, 0}}, {{1, 1, 0, 0}, {1, 0, 1, 0}},
         {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {1, 0, 1, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 0}}, {{1, 1, 0, 0}, {0, 1, 1, 0}}, {{1, 0, 1, 0}, {0, 1, 1, 0}},
         {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{1, 0, 0, 0}, {1, 1, 1, 0}}, {{0, 1, 0, 0}, {1, 1, 1, 0}}, {{0, 0, 1, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}},
         {{1, 0, 0, 0}, {0, 0, 1, 0}}, {{0, 1, 0, 0}, {0, 0, 1, 0}}, {{0, 0, 1, 0}, {0, 0, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 1, 0}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
         {{0, 1, 0, 0}, {1, 0, 0, 0}}, {{0, 0, 1, 0}, {1, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {0, 1, 0, 0}}, {{0, 1, 0, 0}, {0, 1, 0, 0}},
         {{0, 0, 1, 0}, {0, 1, 0, 0}}, {{1, 1, 1, 0}, {0, 1, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
    },
    {
        // 4D
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {0, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 1}}, {{0, 1, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 1, 1}, {0, 1, 1, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
         {{1, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 1}}, {{0, 1, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 1, 1}, {0, 1, 1, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
         {{1, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 1, 1}},
         {{0, 0, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 0, 1}, {0, 1, 0, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {1, 0, 1, 1}}, {{1, 0, 1, 1}, {1, 0, 0, 0}},
         {{1, 1, 0, 1}, {1, 1, 0, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
    },
};

// BC7 interpolation weights in 1/64 units, one row per index precision
// (row 0: 2-bit, row 1: 3-bit, row 2: 4-bit); unused trailing slots are
// zero-padded. cmp_GetRamp() divides these by 64.0 to get the blend factor.
CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_rampI[3][16] = {
    {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},           // 2 bit index
    {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0},        // 3 bit index
    {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}  // 4 bit index
};

// Fix-up index pairs for BC7 multi-subset partitions, one byte per partition
// packed as (BC7_FIXUPINDEX1 << 4) | BC7_FIXUPINDEX2. The fix-up index of
// subset 0 is always texel 0 and is not stored. Unpack via cmp_get_fixuptable().
CMP_STATIC CMP_CONSTANT CGU_UINT32 CMPFIXUPINDEX[128] = {
    // 2 subset partitions 0..63 (only the high nibble is used; low nibble is 0)
    0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u,
    0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u,
    0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u,
    0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u,
    0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u,
    0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u,
    0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u,
    0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u,

    // 3 subset partitions 64..127
    0x3fu, 0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u,
    0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u,
    0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u,
    0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u,
    0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 0x89u, 0xfau,
    0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u,
    0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu,
    0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u};

// Unpack the fix-up (anchor) texel indices for the given partition id.
// fixup[0] is always texel 0; fixup[1]/fixup[2] come from the packed table
// byte (high nibble = second subset, low nibble = third subset).
INLINE void cmp_get_fixuptable(CMP_INOUT CGU_UINT32 fixup[3], CGU_INT part_id)
{
    CGU_UINT32 packed = CMPFIXUPINDEX[part_id];  // gather_int2(FIXUPINDEX, part_id);
    fixup[2]          = packed & 0xF;
    fixup[1]          = packed >> 4;
    fixup[0]          = 0;
}

// Logical right shift of an endpoint code. (perf warning expected)
INLINE CGU_UINT8 shift_right_epocode2(CMP_IN CGU_UINT8 v, CMP_IN CGU_INT bits)
{
    CGU_UINT8 shifted = v >> bits;
    return shifted;
}

// Expand a 'bits'-wide endpoint code to 8 bits by shifting it to the top of
// the byte and replicating its most-significant bits into the freed low bits.
INLINE CGU_UINT8 expand_epocode2(CMP_IN CGU_UINT8 v, CMP_IN CGU_INT bits)
{
    CGU_UINT8 high = v << (8 - bits);
    CGU_UINT8 low  = shift_right_epocode2(high, bits);
    return high + low;
}

// Reconstruct the interpolated value between two quantized endpoints at the
// given ramp index, rounded to the nearest integer (returned as float).
INLINE CGV_FLOAT cmp_GetRamp(CMP_IN CGU_INT   index_bits,  // ramp bits Valid range 2..4
                             CMP_IN CGU_INT   bits,        // Component Valid range 5..8
                             CMP_IN CGU_INT   p1,          // 0..255
                             CMP_IN CGU_INT   p2,          // 0..255
                             CMP_IN CGU_UINT8 index)
{
    // Blend factor in [0,1] from the 1/64-unit weight table for this precision.
    CGV_FLOAT weight = cmp_rampI[index_bits - 2][index] / 64.0F;
    CGU_INT   lo     = expand_epocode2(p1, bits);
    CGU_INT   hi     = expand_epocode2(p2, bits);
    // Round-to-nearest via floor(x + 0.5).
    return cmp_floor(lo + weight * (hi - lo) + 0.5F);
}

#if defined(USE_NEW_SP_ERR_IDX)

#ifndef ASPM_GPU
// CPU-side lookup tables for BC7 single-point (SP) endpoint searches,
// populated once by old_init_BC7ramps().
struct BC7_EncodeRamps2
{
    CGU_INT   ep_d[4][256];                      // expanded 8-bit value per endpoint precision row (5..8 bits) and code
    CGU_UINT8 sp_err[3 * 4 * 256 * 2 * 2 * 16];  // distance to nearest reachable value: [idx-bits][ep-bits][value][p1 LSB][p2 LSB][index]
    CGU_INT   sp_idx[3 * 4 * 256 * 2 * 2 * 16 * 2];  // best (p1,p2) endpoint pair for the same key; last dimension selects p1 or p2
    CGU_BOOL  ramp_init;                         // TRUE once the tables above have been built
};

BC7_EncodeRamps2 BC7EncodeRamps2;

// Table dimensioning for the BC7 single-point error/index lookups.
// BTT2/CLT2 map an endpoint bit count / index precision to its table row.
// Arguments are parenthesized for macro safety (callers may pass expressions).
#define LOG_CL_RANGE2 5  // exclusive upper bound on index precision (2..4 bits used)
#define LOG_CL_BASE2 2   // lowest index precision: 2-bit indices
#define BIT_BASE2 5      // lowest endpoint precision: 5 bits per component
#define BIT_RANGE2 9     // exclusive upper bound on endpoint precision (5..8 bits used)
#define BTT2(bits) ((bits)-BIT_BASE2)
#define CLT2(cl) ((cl)-LOG_CL_BASE2)
#define SOURCE_BLOCK_SIZE 16  // texels per 4x4 block

// Normalized interpolation weights (index weight / 64) for BC7 indices of
// 0..4 bits; row r holds the 2^r weights, zero-padded to 16 entries.
CMP_CONSTANT CGU_FLOAT rampWeights2[5][SOURCE_BLOCK_SIZE] = {
    // 0 bit index
    {0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
     0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
    // 1 bit index
    {0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
     0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
    // 2 bit index
    {0.000000f, 0.328125f, 0.671875f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
     0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
    // 3 bit index
    {0.000000f, 0.140625f, 0.281250f, 0.421875f, 0.578125f, 0.718750f, 0.859375f, 1.000000f,
     0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
    // 4 bit index
    {0.000000f, 0.062500f, 0.140625f, 0.203125f, 0.265625f, 0.328125f, 0.406250f, 0.468750f,
     0.531250f, 0.593750f, 0.671875f, 0.734375f, 0.796875f, 0.859375f, 0.937500f, 1.000000f}};

// Expand a 'bits'-wide value (bits in 5..8) to 8 bits: shift it to the top of
// the byte and replicate its most-significant bits into the freed low bits.
CGU_INT old_expandbits(CGU_INT bits, CGU_INT v)
{
    CGU_INT high = v << (8 - bits);
    CGU_INT low  = v >> (2 * bits - 8);
    return high | low;
}

void old_init_BC7ramps()
{
    // One-time construction of the BC7 single-point (SP) lookup tables in
    // BC7EncodeRamps2. For each index precision (clogBC7), endpoint precision
    // (bits), target 8-bit value (j), endpoint LSB pair (o1,o2) and ramp index,
    // sp_idx records an endpoint code pair that reproduces (or is nearest to)
    // the target value, and sp_err the distance to the nearest reachable value.
    CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE;
    if (g_rampsInitialized == TRUE)
        return;
    g_rampsInitialized        = TRUE;
    BC7EncodeRamps2.ramp_init = TRUE;

    //bc7_isa(); ASPM_PRINT((" INIT Ramps\n"));

    CGU_INT bits;
    CGU_INT p1;
    CGU_INT p2;
    CGU_INT clogBC7;
    CGU_INT index;
    CGU_INT j;
    CGU_INT o1;
    CGU_INT o2;

    // Expand every quantized endpoint code to its 8-bit reconstruction value.
    for (bits = BIT_BASE2; bits < BIT_RANGE2; bits++)
    {
        for (p1 = 0; p1 < (1 << bits); p1++)
        {
            BC7EncodeRamps2.ep_d[BTT2(bits)][p1] = old_expandbits(bits, p1);
        }  //p1
    }      //bits<BIT_RANGE

    for (clogBC7 = LOG_CL_BASE2; clogBC7 < LOG_CL_RANGE2; clogBC7++)
    {
        for (bits = BIT_BASE2; bits < BIT_RANGE2; bits++)
        {
            // SP_ERR_IDX : Init - mark every slot "unreached": idx pair {0,255}, error 255.
            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < 16; index++)
                        {
                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                   (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] = 0;
                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                   (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] = 255;
                            BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (j * 2 * 2 * 16) + (o1 * 2 * 16) +
                                                   (o2 * 16) + index]                                   = 255;
                        }  // i<16
                    }      //o2<2;
                }          //o1<2
            }              //j<256

            // SP_ERR_IDX : calc - mark every value reachable by interpolating some
            // endpoint pair (p1,p2) at some ramp index with error 0, keyed by the
            // endpoints' least-significant bits.
            for (p1 = 0; p1 < (1 << bits); p1++)
            {
                for (p2 = 0; p2 < (1 << bits); p2++)
                {
                    for (index = 0; index < (1 << clogBC7); index++)
                    {
                        CGV_INT floatf = cmp_floor(
                            (CGV_FLOAT)BC7EncodeRamps2.ep_d[BTT2(bits)][p1] +
                            rampWeights2[clogBC7][index] * (CGV_FLOAT)((BC7EncodeRamps2.ep_d[BTT2(bits)][p2] - BC7EncodeRamps2.ep_d[BTT2(bits)][p1])) + 0.5F);
                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (floatf * 2 * 2 * 16 * 2) +
                                               ((p1 & 0x1) * 2 * 16 * 2) + ((p2 & 0x1) * 16 * 2) + (index * 2) + 0] = p1;
                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (floatf * 2 * 2 * 16 * 2) +
                                               ((p1 & 0x1) * 2 * 16 * 2) + ((p2 & 0x1) * 16 * 2) + (index * 2) + 1] = p2;
                        // BUGFIX: the o2 term was written as (p2 & 0x1 * 16), which C parses
                        // as (p2 & 16) because '*' binds tighter than '&' - i.e. it keyed on
                        // bit 4 of p2 instead of its LSB. It must mirror ((p2 & 0x1) * 16),
                        // matching the sp_idx writes above and the sp_err reads below.
                        BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (floatf * 2 * 2 * 16) +
                                               ((p1 & 0x1) * 2 * 16) + ((p2 & 0x1) * 16) + index]                   = 0;
                    }  //i<(1 << clogBC7)
                }      //p2
            }          //p1<(1 << bits)

            // For every slot still marked unreached, search outward (j-k, j+k) for
            // the nearest reachable value, copy its endpoint pair, and record the
            // distance k as the error.
            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < (1 << clogBC7); index++)
                        {
                            if (  // check for unitialized sp_idx
                                (BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                        (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] == 0) &&
                                (BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                        (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] == 255))

                            {
                                CGU_INT k;
                                CGU_INT tf;
                                CGU_INT tc;

                                for (k = 1; k < 256; k++)
                                {
                                    tf = j - k;
                                    tc = j + k;
                                    if ((tf >= 0 && BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) +
                                                                           (tf * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] == 0))
                                    {
                                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                               (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] =
                                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                   (tf * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0];
                                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                               (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] =
                                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                   (tf * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1];
                                        break;
                                    }
                                    // NOTE(review): the upward (tc) match copies only entry +0;
                                    // entry +1 keeps its init value (255). Looks intentional in
                                    // the original - confirm before changing.
                                    else if ((tc < 256 && BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) +
                                                                                 (tc * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] == 0))
                                    {
                                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                               (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] =
                                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                   (tc * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0];
                                        break;
                                    }
                                }

                                // NOTE(review): if no neighbor was found, k == 256 here and the
                                // cast truncates the stored error to 0 - verify this cannot occur
                                // once the calc pass above has run.
                                BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (j * 2 * 2 * 16) +
                                                       (o1 * 2 * 16) + (o2 * 16) + index] = (CGU_UINT8)k;

                            }  //sp_idx < 0
                        }      //i<(1 << clogBC7)
                    }          //o2
                }              //o1
            }                  //j

        }  //bits<BIT_RANGE
    }      //clogBC7<LOG_CL_RANGE
}

// Absolute value of a float (equivalent to a > 0 ? a : -a).
CGV_FLOAT old_img_absf(CGV_FLOAT a)
{
    if (a > 0.0F)
        return a;
    return -a;
}

// Look up the precomputed single-point error for a (index precision, endpoint
// precision, value, endpoint LSB pair, index) key. Returns 0 when the ramp
// tables have not been initialized yet.
INLINE CGV_FLOAT old_get_sperr(CGU_INT   clogBC7,  // ramp bits Valid range 2..4
                               CGU_INT   bits,     // Component Valid range 5..8
                               CGV_INT   p1,       // 0..255
                               CGU_INT   t1,
                               CGU_INT   t2,
                               CGV_UINT8 index)
{
    if (!BC7EncodeRamps2.ramp_init)
        return 0.0f;
    CGV_INT offset =
        (CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (p1 * 2 * 2 * 16) + (t1 * 2 * 16) + (t2 * 16) + index;
    return BC7EncodeRamps2.sp_err[offset];
}
#endif

#endif

#endif  // Not ASPM_HLSL

#endif  // ENABLE_CMP_API

#define get_end_point_l(subset) shared_temp[threadBase + subset].endPoint_low_quantized
#define get_end_point_h(subset) shared_temp[threadBase + subset].endPoint_high_quantized
#define get_color_index(index) shared_temp[threadBase + index].error
#define get_alpha_index(index) shared_temp[threadBase + index].mode

//4 bit index: 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
// aStep[k][d]: nearest interpolation index for a projected distance d (0..63).
// Row 0 serves 4-bit indices, row 1 3-bit, row 2 2-bit; the ramp listed above
// each row is the matching aWeight row, and the step thresholds sit at the
// midpoints between adjacent weights.
CMP_STATIC CMP_CONSTANT CGU_UINT32 aStep[3][64] = {
    {0, 0, 0, 1, 1, 1, 1, 2, 2, 2,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  7,
     7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15},
    //3 bit index: 0, 9, 18, 27, 37, 46, 55, 64
    {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7},
    //2 bit index: 0, 21, 43, 64
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}};

// aWeight[k][i]: 6-bit fixed-point interpolation weights (0..64) for index i.
// Row 0: 4-bit indices (16 weights); row 1: 3-bit (8 weights, rest padding);
// row 2: 2-bit (4 weights, rest padding).
CMP_STATIC CMP_CONSTANT CGU_UINT32 aWeight[3][16] = {{0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64},
                                                     {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0},
                                                     {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};

//Associated to partition 0-63
// Two-subset partition shapes, one 16-bit mask per partition.
// NOTE(review): each bit appears to give the subset of the corresponding
// pixel (0..15) -- confirm bit order against the index-extraction code.
CMP_STATIC CMP_CONSTANT CGU_UINT32 blockPartitions[64] = {
    0xCCCC, 0x8888, 0xEEEE, 0xECC8, 0xC880, 0xFEEC, 0xFEC8, 0xEC80, 0xC800, 0xFFEC, 0xFE80, 0xE800, 0xFFE8, 0xFF00, 0xFFF0, 0xF000,
    0xF710, 0x008E, 0x7100, 0x08CE, 0x008C, 0x7310, 0x3100, 0x8CCE, 0x088C, 0x3110, 0x6666, 0x366C, 0x17E8, 0x0FF0, 0x718E, 0x399C,
    0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a, 0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x660,
    0x272,  0x4e4,  0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c, 0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0xfcc,  0x7744, 0xee22,
};

//Associated to partition 64-127
// Three-subset partition shapes, one 32-bit mask per partition.
// NOTE(review): appears to hold two bits of subset id per pixel -- confirm
// bit layout against the index-extraction code.
CMP_STATIC CMP_CONSTANT CGU_UINT32 blockPartitions2[64] = {
    0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8, 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050, 0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090, 0x94949494,
    0xa4a4a4a4, 0xa9a59450, 0x2a0a4250, 0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0, 0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500, 0x0050a4a4, 0xaaa59090,
    0x14696914, 0x69691400, 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200, 0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424, 0x00aa5500, 0x24924924, 0x24499224,
    0x50a50a50, 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0, 0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600, 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
    0xa85454a8, 0x80959580, 0xaa141414, 0x96960000, 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000, 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
};

// Fix-up (anchor) index positions for each of the 128 BC7 partitions.
// [0] is the anchor of subset 1, [1] the anchor of subset 2.
// Entries 0-63 are the two-subset partitions, so their second component (0)
// must never be read; entries 64-127 are the three-subset partitions.
CMP_STATIC CMP_CONSTANT CGU_Vec2ui candidateFixUpIndex1D[128] = {
    // partitions 0-63 (second component unused)
    {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0},
    {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0},
    {15, 0}, {2, 0},  {8, 0},  {2, 0},  {2, 0},  {8, 0},  {8, 0},  {15, 0},
    {2, 0},  {8, 0},  {2, 0},  {2, 0},  {8, 0},  {8, 0},  {2, 0},  {2, 0},
    {15, 0}, {15, 0}, {6, 0},  {8, 0},  {2, 0},  {8, 0},  {15, 0}, {15, 0},
    {2, 0},  {8, 0},  {2, 0},  {2, 0},  {2, 0},  {15, 0}, {15, 0}, {6, 0},
    {6, 0},  {2, 0},  {6, 0},  {8, 0},  {15, 0}, {15, 0}, {2, 0},  {2, 0},
    {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {2, 0},  {2, 0},  {15, 0},
    // partitions 64-127 (three subsets)
    {3, 15},  {3, 8},   {15, 8},  {15, 3},  {8, 15},  {3, 15},  {15, 3},  {15, 8},
    {8, 15},  {8, 15},  {6, 15},  {6, 15},  {6, 15},  {5, 15},  {3, 15},  {3, 8},
    {3, 15},  {3, 8},   {8, 15},  {15, 3},  {3, 15},  {3, 8},   {6, 15},  {10, 8},
    {5, 3},   {8, 15},  {8, 6},   {6, 10},  {8, 15},  {5, 15},  {15, 10}, {15, 8},
    {8, 15},  {15, 3},  {3, 15},  {5, 10},  {6, 10},  {10, 8},  {8, 9},   {15, 10},
    // The spec does not mark the first fix-up index for the next row;
    // 15 is applied and appears to be correct.
    {15, 6},  {3, 15},  {15, 8},  {5, 15},  {15, 3},  {15, 6},  {15, 6},  {15, 8},
    {3, 15},  {15, 3},  {5, 15},  {5, 15},  {5, 15},  {8, 15},  {5, 15},  {10, 15},
    {5, 15},  {10, 15}, {8, 15},  {13, 15}, {15, 3},  {12, 15}, {3, 15},  {3, 8},
};

// Same anchor positions as candidateFixUpIndex1D, but with each pair stored
// in ascending order; used by the block packers, which emit indices in
// position order. Entries 0-63 (two-subset partitions) never use [1].
CMP_STATIC CMP_CONSTANT CGU_Vec2ui candidateFixUpIndex1DOrdered[128] = {
    // partitions 0-63 (second component unused)
    {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0},
    {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0},
    {15, 0}, {2, 0},  {8, 0},  {2, 0},  {2, 0},  {8, 0},  {8, 0},  {15, 0},
    {2, 0},  {8, 0},  {2, 0},  {2, 0},  {8, 0},  {8, 0},  {2, 0},  {2, 0},
    {15, 0}, {15, 0}, {6, 0},  {8, 0},  {2, 0},  {8, 0},  {15, 0}, {15, 0},
    {2, 0},  {8, 0},  {2, 0},  {2, 0},  {2, 0},  {15, 0}, {15, 0}, {6, 0},
    {6, 0},  {2, 0},  {6, 0},  {8, 0},  {15, 0}, {15, 0}, {2, 0},  {2, 0},
    {15, 0}, {15, 0}, {15, 0}, {15, 0}, {15, 0}, {2, 0},  {2, 0},  {15, 0},
    // partitions 64-127 (three subsets, pairs sorted ascending)
    {3, 15},  {3, 8},   {8, 15},  {3, 15},  {8, 15},  {3, 15},  {3, 15},  {8, 15},
    {8, 15},  {8, 15},  {6, 15},  {6, 15},  {6, 15},  {5, 15},  {3, 15},  {3, 8},
    {3, 15},  {3, 8},   {8, 15},  {3, 15},  {3, 15},  {3, 8},   {6, 15},  {8, 10},
    {3, 5},   {8, 15},  {6, 8},   {6, 10},  {8, 15},  {5, 15},  {10, 15}, {8, 15},
    {8, 15},  {3, 15},  {3, 15},  {5, 10},  {6, 10},  {8, 10},  {8, 9},   {10, 15},
    // The spec does not mark the first fix-up index for the next row;
    // 15 is applied and appears to be correct.
    {6, 15},  {3, 15},  {8, 15},  {5, 15},  {3, 15},  {6, 15},  {6, 15},  {8, 15},
    {3, 15},  {3, 15},  {5, 15},  {5, 15},  {5, 15},  {8, 15},  {5, 15},  {10, 15},
    {5, 15},  {10, 15}, {8, 15},  {13, 15}, {3, 15},  {12, 15}, {3, 15},  {3, 8}};

// Quantize each 8-bit component down to uPrec bits with rounding.
// (c*257) expands the byte to 16 bits; adding 32768 rounds to nearest.
CGU_Vec4ui quantize(CGU_Vec4ui color, CGU_UINT32 uPrec)
{
    CGU_Vec4ui expanded = (color << 8) + color;
    return (expanded * ((1 << uPrec) - 1) + 32768U) >> 16;
}

// Expand uPrec-bit components back to 8 bits by shifting up and replicating
// the top bits into the vacated low bits.
CGU_Vec4ui unquantize(CGU_Vec4ui color, CGU_UINT32 uPrec)
{
#ifdef ASPM_GPU
    color = color << (8 - uPrec);
    return color | (color >> uPrec);
#else
    // CPU path: same computation written component-wise.
    CGU_UINT32 up = 8 - uPrec;
    CGU_Vec4ui wide;
    wide.x = color.x << up;
    wide.y = color.y << up;
    wide.z = color.z << up;
    wide.w = color.w << up;

    CGU_Vec4ui expanded;
    expanded.x = wide.x | (wide.x >> uPrec);
    expanded.y = wide.y | (wide.y >> uPrec);
    expanded.z = wide.z | (wide.z >> uPrec);
    expanded.w = wide.w | (wide.w >> uPrec);
    return expanded;
#endif
}

// Exchange two 4-component vectors in place.
void swap(CMP_INOUT CGU_Vec4ui CMP_REFINOUT lhs, CMP_INOUT CGU_Vec4ui CMP_REFINOUT rhs)
{
    CGU_Vec4ui saved = rhs;
    rhs              = lhs;
    lhs              = saved;
}

// Exchange two 3-component vectors in place.
void swap(CMP_INOUT CGU_Vec3ui CMP_REFINOUT lhs, CMP_INOUT CGU_Vec3ui CMP_REFINOUT rhs)
{
    CGU_Vec3ui saved = rhs;
    rhs              = lhs;
    lhs              = saved;
}

// Exchange two unsigned scalars in place.
void swap(CMP_INOUT CGU_UINT32 CMP_REFINOUT lhs, CMP_INOUT CGU_UINT32 CMP_REFINOUT rhs)
{
    CGU_UINT32 saved = rhs;
    rhs              = lhs;
    lhs              = saved;
}

// Weighted error accumulation: the RGB channels contribute via a plain dot
// product while the alpha term is scaled by the global alpha weight.
CGU_UINT32 ComputeError(CMP_IN CGU_Vec4ui a, CMP_IN CGU_Vec4ui b)
{
    CGU_UINT32 rgbTerm   = dot(a.rgb, b.rgb);
    CGU_UINT32 alphaTerm = g_alpha_weight * a.a * b.a;
    return rgbTerm + alphaTerm;
}

// Component-wise: exchange any channel pair where a holds the smaller value,
// so that on return every component of a is >= the matching component of b.
void Ensure_A_Is_Larger(CMP_INOUT CGU_Vec4ui CMP_REFINOUT a, CMP_INOUT CGU_Vec4ui CMP_REFINOUT b)
{
    CGU_UINT32 t;
    if (a.x < b.x)
    {
        t   = a.x;
        a.x = b.x;
        b.x = t;
    }
    if (a.y < b.y)
    {
        t   = a.y;
        a.y = b.y;
        b.y = t;
    }
    if (a.z < b.z)
    {
        t   = a.z;
        a.z = b.z;
        b.z = t;
    }
    if (a.w < b.w)
    {
        t   = a.w;
        a.w = b.w;
        b.w = t;
    }
}

// Mode 0 endpoint compression: RGB is quantized to 5 bits and the LSB is
// replaced by the per-endpoint P-bit; alpha is forced opaque. endPoint is
// overwritten with the dequantized values so the caller can measure the
// quantization error, and quantized is pre-shifted (<<3) into the bit
// positions the block packer expects.
void compress_endpoints0(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb & 0xFFFFFFFE;
        quantized[j].rgb |= P[j];
        quantized[j].a = 0xFF;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a   = 0xFF;

        quantized[j] <<= 3;
    }
#else
    // CPU path: same steps written component-wise (no swizzled assignment).
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r = endPoint[j].r;
        rgbb.g = endPoint[j].g;
        rgbb.b = endPoint[j].b;
        rgbb.a = endPoint[j].b;

        quantized[j].rgb = quantize(rgbb, 5).rgb;
        quantized[j].r &= 0xFFFFFFFE;
        quantized[j].g &= 0xFFFFFFFE;
        quantized[j].b &= 0xFFFFFFFE;

        quantized[j].r |= P[j];
        quantized[j].g |= P[j];
        quantized[j].b |= P[j];
        quantized[j].a = 0xFF;

        rgbb.r = quantized[j].r;
        rgbb.g = quantized[j].g;
        rgbb.b = quantized[j].b;
        rgbb.a = quantized[j].b;

        endPoint[j].rgb = unquantize(rgbb, 5).rgb;
        endPoint[j].a   = 0xFF;

        quantized[j].r <<= 3;
        quantized[j].g <<= 3;
        quantized[j].b <<= 3;
        quantized[j].a <<= 3;
    }
#endif
}

// Mode 1 endpoint compression: RGB is quantized to 7 bits and the LSB is
// replaced by the per-endpoint P-bit; alpha is forced opaque. endPoint is
// overwritten with the dequantized values for error evaluation and the
// quantized values are pre-shifted (<<1) for packing.
void compress_endpoints1(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_OUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb & 0xFFFFFFFE;
        quantized[j].rgb |= P[j];
        quantized[j].a  = 0xFF;
        endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb;
        endPoint[j].a   = 0xFF;
        quantized[j] <<= 1;
    }
#else
    // CPU path: same steps written component-wise (no swizzled assignment).
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r           = endPoint[j].r;
        rgbb.g           = endPoint[j].g;
        rgbb.b           = endPoint[j].b;
        rgbb.a           = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 7).rgb;

        quantized[j].r &= 0xFFFFFFFE;
        quantized[j].g &= 0xFFFFFFFE;
        quantized[j].b &= 0xFFFFFFFE;

        quantized[j].r |= P[j];
        quantized[j].g |= P[j];
        quantized[j].b |= P[j];
        quantized[j].a = 0xFF;

        rgbb.r          = quantized[j].r;
        rgbb.g          = quantized[j].g;
        rgbb.b          = quantized[j].b;
        rgbb.a          = quantized[j].b;
        endPoint[j].rgb = unquantize(rgbb, 7).rgb;
        endPoint[j].a   = 0xFF;
        quantized[j].r  = quantized[j].r << 1;
        quantized[j].g  = quantized[j].g << 1;
        quantized[j].b  = quantized[j].b << 1;
        quantized[j].a  = quantized[j].a << 1;
    }
#endif
}

// Mode 2 endpoint compression: RGB is quantized to 5 bits (no P-bit);
// alpha is forced opaque. endPoint is overwritten with the dequantized
// values for error evaluation; quantized is pre-shifted (<<3) for packing.
void compress_endpoints2(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb;
        quantized[j].a   = 0xFF;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a   = 0xFF;

        quantized[j] <<= 3;
    }
#else
    // CPU path: same steps written component-wise (no swizzled assignment).
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r           = endPoint[j].r;
        rgbb.g           = endPoint[j].g;
        rgbb.b           = endPoint[j].b;
        rgbb.a           = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 5).rgb;
        quantized[j].a   = 0xFF;

        rgbb.r = quantized[j].r;
        rgbb.g = quantized[j].g;
        rgbb.b = quantized[j].b;
        rgbb.a = quantized[j].b;

        endPoint[j].rgb = unquantize(rgbb, 5).rgb;
        endPoint[j].a   = 0xFF;

        quantized[j].r <<= 3;
        quantized[j].g <<= 3;
        quantized[j].b <<= 3;
        quantized[j].a <<= 3;
    }
#endif
}

// Mode 3 endpoint compression: RGB keeps 8-bit precision except that the LSB
// is replaced by the per-endpoint P-bit; alpha is forced opaque. endPoint is
// updated to the exact values the decoder will reconstruct.
void compress_endpoints3(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        CGU_UINT32 pBit = P[ep];

        quantized[ep].r = (endPoint[ep].x & 0xFFFFFFFE) | pBit;
        quantized[ep].g = (endPoint[ep].y & 0xFFFFFFFE) | pBit;
        quantized[ep].b = (endPoint[ep].z & 0xFFFFFFFE) | pBit;
        quantized[ep].a = 0xFF;

        endPoint[ep].r = quantized[ep].r;
        endPoint[ep].g = quantized[ep].g;
        endPoint[ep].b = quantized[ep].b;
        endPoint[ep].a = 0xFF;
    }
}

// Mode 4 endpoint compression: RGB quantized to 5 bits, alpha to 6 bits
// (no P-bits). endPoint is overwritten with the dequantized values for
// error evaluation; quantized RGB is pre-shifted <<3 and alpha <<2.
// NOTE(review): this function (and mode 5) is guarded by ASPM_HLSL, while
// modes 0-2 use ASPM_GPU -- confirm that distinction is intentional.
void compress_endpoints4(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_HLSL
    [unroll] for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb;
        quantized[j].a   = quantize(endPoint[j].a, 6).r;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a   = unquantize(quantized[j].a, 6).r;

        quantized[j].rgb <<= 3;
        quantized[j].a <<= 2;
    }
#else
    // CPU path: same steps written component-wise (no swizzled assignment).
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r           = endPoint[j].r;
        rgbb.g           = endPoint[j].g;
        rgbb.b           = endPoint[j].b;
        rgbb.a           = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 5).rgb;
        quantized[j].a   = quantize(endPoint[j].a, 6).r;

        rgbb.r          = quantized[j].r;
        rgbb.g          = quantized[j].g;
        rgbb.b          = quantized[j].b;
        rgbb.a          = quantized[j].b;
        endPoint[j].rgb = unquantize(rgbb, 5).rgb;
        endPoint[j].a   = unquantize(quantized[j].a, 6).r;

        quantized[j].r <<= 3;
        quantized[j].g <<= 3;
        quantized[j].b <<= 3;
        quantized[j].a <<= 2;
    }
#endif
}

// Mode 5 endpoint compression: RGB quantized to 7 bits; alpha is kept at
// full 8-bit precision (copied through unchanged). endPoint RGB is
// overwritten with the dequantized values for error evaluation; quantized
// RGB is pre-shifted (<<1) for packing.
void compress_endpoints5(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_HLSL
    CMP_UNROLL for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb;
        quantized[j].a   = endPoint[j].a;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb;
        // endPoint[j].a   Alpha is full precision

        quantized[j].rgb <<= 1;
    }
#else
    // CPU path: same steps written component-wise (no swizzled assignment).
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r           = endPoint[j].r;
        rgbb.g           = endPoint[j].g;
        rgbb.b           = endPoint[j].b;
        rgbb.a           = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 7).rgb;
        quantized[j].a   = endPoint[j].a;
        rgbb.r           = quantized[j].r;
        rgbb.g           = quantized[j].g;
        rgbb.b           = quantized[j].b;
        rgbb.a           = quantized[j].b;
        endPoint[j].rgb  = unquantize(rgbb, 7).rgb;
        quantized[j].r <<= 1;
        quantized[j].g <<= 1;
        quantized[j].b <<= 1;
    }
#endif
}

// Mode 6 endpoint compression: all four channels keep 8-bit precision
// except the LSB, which is replaced by the per-endpoint P-bit; the decoder
// reconstructs exactly these values, so endPoint is simply copied back.
void compress_endpoints6(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_OUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        CGU_UINT32 pBit = P[ep];

        quantized[ep].x = (endPoint[ep].x & 0xFFFFFFFE) | pBit;
        quantized[ep].y = (endPoint[ep].y & 0xFFFFFFFE) | pBit;
        quantized[ep].z = (endPoint[ep].z & 0xFFFFFFFE) | pBit;
        quantized[ep].w = (endPoint[ep].w & 0xFFFFFFFE) | pBit;

        endPoint[ep] = quantized[ep];
    }
}

// Mode 7 endpoint compression: all four channels quantized to 6 bits with
// the LSB replaced by the per-endpoint P-bit. endPoint is overwritten with
// the dequantized values for error evaluation; quantized is pre-shifted
// (<<2) for packing. (The two original loops are merged; iterations are
// independent per endpoint, so the result is identical.)
void compress_endpoints7(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        quantized[ep] = quantize(endPoint[ep], 6);

        CGU_UINT32 pBit = P[ep];
        quantized[ep].x = (quantized[ep].x & 0xFFFFFFFE) | pBit;
        quantized[ep].y = (quantized[ep].y & 0xFFFFFFFE) | pBit;
        quantized[ep].z = (quantized[ep].z & 0xFFFFFFFE) | pBit;
        quantized[ep].w = (quantized[ep].w & 0xFFFFFFFE) | pBit;

        endPoint[ep] = unquantize(quantized[ep], 6);

        quantized[ep].x <<= 2;
        quantized[ep].y <<= 2;
        quantized[ep].z <<= 2;
        quantized[ep].w <<= 2;
    }
}

// Assembles the 128-bit mode 0 block (mode bit 0x01): partition id stored as
// (partition - 64), three subsets of 4-bit RGB endpoints, per-endpoint
// P-bits, and 3-bit color indices. Endpoints/indices are read from
// shared_temp via the get_* macros.
void block_package0(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    block.x = 0x01 | ((partition - 64) << 1) | ((get_end_point_l(0).r & 0xF0) << 1) | ((get_end_point_h(0).r & 0xF0) << 5) |
              ((get_end_point_l(1).r & 0xF0) << 9) | ((get_end_point_h(1).r & 0xF0) << 13) | ((get_end_point_l(2).r & 0xF0) << 17) |
              ((get_end_point_h(2).r & 0xF0) << 21) | ((get_end_point_l(0).g & 0xF0) << 25);
    block.y = ((get_end_point_l(0).g & 0xF0) >> 7) | ((get_end_point_h(0).g & 0xF0) >> 3) | ((get_end_point_l(1).g & 0xF0) << 1) |
              ((get_end_point_h(1).g & 0xF0) << 5) | ((get_end_point_l(2).g & 0xF0) << 9) | ((get_end_point_h(2).g & 0xF0) << 13) |
              ((get_end_point_l(0).b & 0xF0) << 17) | ((get_end_point_h(0).b & 0xF0) << 21) | ((get_end_point_l(1).b & 0xF0) << 25);
    block.z = ((get_end_point_l(1).b & 0xF0) >> 7) | ((get_end_point_h(1).b & 0xF0) >> 3) | ((get_end_point_l(2).b & 0xF0) << 1) |
              ((get_end_point_h(2).b & 0xF0) << 5) | ((get_end_point_l(0).r & 0x08) << 10) | ((get_end_point_h(0).r & 0x08) << 11) |
              ((get_end_point_l(1).r & 0x08) << 12) | ((get_end_point_h(1).r & 0x08) << 13) | ((get_end_point_l(2).r & 0x08) << 14) |
              ((get_end_point_h(2).r & 0x08) << 15) | (get_color_index(0) << 19);
    block.w      = 0;
    CGU_UINT32 i = 1;
    // Index packing: each anchor (fix-up) index drops its MSB, so the shift
    // amount decreases by one after each anchor position is passed.
    for (; i <= cmp_min(candidateFixUpIndex1DOrdered[partition][0], 4); i++)
    {
        block.z |= get_color_index(i) << (i * 3 + 18);
    }
    if (candidateFixUpIndex1DOrdered[partition][0] < 4)  //i = 4
    {
        block.z |= get_color_index(4) << 29;
        i += 1;
    }
    else  //i = 5
    {
        // Index 4 straddles the z/w dword boundary.
        block.w |= (get_color_index(4) & 0x04) >> 2;
        for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
            block.w |= get_color_index(i) << (i * 3 - 14);
    }
    for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++)
    {
        block.w |= get_color_index(i) << (i * 3 - 15);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 3 - 16);
    }
}

// Assembles the 128-bit mode 1 block (mode bit 0x02): 6-bit partition id,
// two subsets of 6-bit RGB endpoints, shared P-bits, and 3-bit color
// indices. The anchor index position (15, 2, 8 or 6) determines where index
// bit-widths shrink, hence the four hard-coded packing variants.
void block_package1(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    block.x = 0x02 | (partition << 2) | ((get_end_point_l(0).r & 0xFC) << 6) | ((get_end_point_h(0).r & 0xFC) << 12) | ((get_end_point_l(1).r & 0xFC) << 18) |
              ((get_end_point_h(1).r & 0xFC) << 24);
    block.y = ((get_end_point_l(0).g & 0xFC) >> 2) | ((get_end_point_h(0).g & 0xFC) << 4) | ((get_end_point_l(1).g & 0xFC) << 10) |
              ((get_end_point_h(1).g & 0xFC) << 16) | ((get_end_point_l(0).b & 0xFC) << 22) | ((get_end_point_h(0).b & 0xFC) << 28);
    block.z = ((get_end_point_h(0).b & 0xFC) >> 4) | ((get_end_point_l(1).b & 0xFC) << 2) | ((get_end_point_h(1).b & 0xFC) << 8) |
              ((get_end_point_l(0).r & 0x02) << 15) | ((get_end_point_l(1).r & 0x02) << 16) | (get_color_index(0) << 18);
    if (candidateFixUpIndex1DOrdered[partition][0] == 15)
    {
        block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) |
                  (get_color_index(11) << 18) | (get_color_index(10) << 15) | (get_color_index(9) << 12) | (get_color_index(8) << 9) |
                  (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else if (candidateFixUpIndex1DOrdered[partition][0] == 2)
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 8) |
                  (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
        block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) |
                   (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else if (candidateFixUpIndex1DOrdered[partition][0] == 8)
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 9) |
                  (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else  //candidateFixUpIndex1DOrdered[partition] == 6
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 8) |
                  (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
}

// Assembles the 128-bit mode 2 block (mode bit 0x04): partition id stored as
// (partition - 64), three subsets of 5-bit RGB endpoints (no P-bits), and
// 2-bit color indices.
void block_package2(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    block.x = 0x04 | ((partition - 64) << 3) | ((get_end_point_l(0).r & 0xF8) << 6) | ((get_end_point_h(0).r & 0xF8) << 11) |
              ((get_end_point_l(1).r & 0xF8) << 16) | ((get_end_point_h(1).r & 0xF8) << 21) | ((get_end_point_l(2).r & 0xF8) << 26);
    block.y = ((get_end_point_l(2).r & 0xF8) >> 6) | ((get_end_point_h(2).r & 0xF8) >> 1) | ((get_end_point_l(0).g & 0xF8) << 4) |
              ((get_end_point_h(0).g & 0xF8) << 9) | ((get_end_point_l(1).g & 0xF8) << 14) | ((get_end_point_h(1).g & 0xF8) << 19) |
              ((get_end_point_l(2).g & 0xF8) << 24);
    block.z = ((get_end_point_h(2).g & 0xF8) >> 3) | ((get_end_point_l(0).b & 0xF8) << 2) | ((get_end_point_h(0).b & 0xF8) << 7) |
              ((get_end_point_l(1).b & 0xF8) << 12) | ((get_end_point_h(1).b & 0xF8) << 17) | ((get_end_point_l(2).b & 0xF8) << 22) |
              ((get_end_point_h(2).b & 0xF8) << 27);
    block.w      = ((get_end_point_h(2).b & 0xF8) >> 5) | (get_color_index(0) << 3);
    CGU_UINT32 i = 1;
    // Index packing: each anchor (fix-up) index drops its MSB, so the shift
    // amount decreases by one after each anchor position is passed.
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 2);
    }
    for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}

// Assembles the 128-bit mode 3 block (mode bit 0x08): 6-bit partition id,
// two subsets of 7-bit RGB endpoints, per-endpoint P-bits, and 2-bit color
// indices.
void block_package3(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    block.x = 0x08 | (partition << 4) | ((get_end_point_l(0).r & 0xFE) << 9) | ((get_end_point_h(0).r & 0xFE) << 16) | ((get_end_point_l(1).r & 0xFE) << 23) |
              ((get_end_point_h(1).r & 0xFE) << 30);
    block.y = ((get_end_point_h(1).r & 0xFE) >> 2) | ((get_end_point_l(0).g & 0xFE) << 5) | ((get_end_point_h(0).g & 0xFE) << 12) |
              ((get_end_point_l(1).g & 0xFE) << 19) | ((get_end_point_h(1).g & 0xFE) << 26);
    block.z = ((get_end_point_h(1).g & 0xFE) >> 6) | ((get_end_point_l(0).b & 0xFE) << 1) | ((get_end_point_h(0).b & 0xFE) << 8) |
              ((get_end_point_l(1).b & 0xFE) << 15) | ((get_end_point_h(1).b & 0xFE) << 22) | ((get_end_point_l(0).r & 0x01) << 30) |
              ((get_end_point_h(0).r & 0x01) << 31);
    block.w = ((get_end_point_l(1).r & 0x01) << 0) | ((get_end_point_h(1).r & 0x01) << 1) | (get_color_index(0) << 2);

    CGU_UINT32 i = 1;
    // Indices after the anchor position lose one bit of shift (anchor MSB
    // is implicit zero and not stored).
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}

// Assembles the 128-bit mode 4 block (mode bit 0x10): 2-bit rotation, 1-bit
// index selector, one subset of 5-bit RGB / 6-bit alpha endpoints, 2-bit
// color indices and 3-bit alpha indices. Anchor index 0 drops its MSB in
// both index streams.
void block_package4(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 rotation, CGU_UINT32 index_selector, CGU_UINT32 threadBase)
{
    block.x = 0x10 | ((rotation & 3) << 5) | ((index_selector & 1) << 7) | ((get_end_point_l(0).r & 0xF8) << 5) | ((get_end_point_h(0).r & 0xF8) << 10) |
              ((get_end_point_l(0).g & 0xF8) << 15) | ((get_end_point_h(0).g & 0xF8) << 20) | ((get_end_point_l(0).b & 0xF8) << 25);

    block.y = ((get_end_point_l(0).b & 0xF8) >> 7) | ((get_end_point_h(0).b & 0xF8) >> 2) | ((get_end_point_l(0).a & 0xFC) << 4) |
              ((get_end_point_h(0).a & 0xFC) << 10) | ((get_color_index(0) & 1) << 18) | (get_color_index(1) << 19) | (get_color_index(2) << 21) |
              (get_color_index(3) << 23) | (get_color_index(4) << 25) | (get_color_index(5) << 27) | (get_color_index(6) << 29) | (get_color_index(7) << 31);

    block.z = (get_color_index(7) >> 1) | (get_color_index(8) << 1) | (get_color_index(9) << 3) | (get_color_index(10) << 5) | (get_color_index(11) << 7) |
              (get_color_index(12) << 9) | (get_color_index(13) << 11) | (get_color_index(14) << 13) | (get_color_index(15) << 15) |
              ((get_alpha_index(0) & 3) << 17) | (get_alpha_index(1) << 19) | (get_alpha_index(2) << 22) | (get_alpha_index(3) << 25) |
              (get_alpha_index(4) << 28) | (get_alpha_index(5) << 31);

    block.w = (get_alpha_index(5) >> 1) | (get_alpha_index(6) << 2) | (get_alpha_index(7) << 5) | (get_alpha_index(8) << 8) | (get_alpha_index(9) << 11) |
              (get_alpha_index(10) << 14) | (get_alpha_index(11) << 17) | (get_alpha_index(12) << 20) | (get_alpha_index(13) << 23) |
              (get_alpha_index(14) << 26) | (get_alpha_index(15) << 29);
}

// Assembles the 128-bit mode 5 block (mode bit 0x20): 2-bit rotation, one
// subset of 7-bit RGB endpoints, full 8-bit alpha endpoints, and separate
// 2-bit color and alpha index streams (anchor index 0 drops its MSB).
void block_package5(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 rotation, CGU_UINT32 threadBase)
{
    block.x = 0x20 | (rotation << 6) | ((get_end_point_l(0).r & 0xFE) << 7) | ((get_end_point_h(0).r & 0xFE) << 14) | ((get_end_point_l(0).g & 0xFE) << 21) |
              ((get_end_point_h(0).g & 0xFE) << 28);
    block.y = ((get_end_point_h(0).g & 0xFE) >> 4) | ((get_end_point_l(0).b & 0xFE) << 3) | ((get_end_point_h(0).b & 0xFE) << 10) |
              (get_end_point_l(0).a << 18) | (get_end_point_h(0).a << 26);
    block.z = (get_end_point_h(0).a >> 6) | (get_color_index(0) << 2) | (get_color_index(1) << 3) | (get_color_index(2) << 5) | (get_color_index(3) << 7) |
              (get_color_index(4) << 9) | (get_color_index(5) << 11) | (get_color_index(6) << 13) | (get_color_index(7) << 15) | (get_color_index(8) << 17) |
              (get_color_index(9) << 19) | (get_color_index(10) << 21) | (get_color_index(11) << 23) | (get_color_index(12) << 25) |
              (get_color_index(13) << 27) | (get_color_index(14) << 29) | (get_color_index(15) << 31);
    block.w = (get_color_index(15) >> 1) | (get_alpha_index(0) << 1) | (get_alpha_index(1) << 2) | (get_alpha_index(2) << 4) | (get_alpha_index(3) << 6) |
              (get_alpha_index(4) << 8) | (get_alpha_index(5) << 10) | (get_alpha_index(6) << 12) | (get_alpha_index(7) << 14) | (get_alpha_index(8) << 16) |
              (get_alpha_index(9) << 18) | (get_alpha_index(10) << 20) | (get_alpha_index(11) << 22) | (get_alpha_index(12) << 24) |
              (get_alpha_index(13) << 26) | (get_alpha_index(14) << 28) | (get_alpha_index(15) << 30);
}

// Assembles the 128-bit mode 6 block (mode bit 0x40): one subset of 7-bit
// RGBA endpoints with per-endpoint P-bits and 4-bit color indices (anchor
// index 0 drops its MSB).
void block_package6(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 threadBase)
{
    block.x = 0x40 | ((get_end_point_l(0).r & 0xFE) << 6) | ((get_end_point_h(0).r & 0xFE) << 13) | ((get_end_point_l(0).g & 0xFE) << 20) |
              ((get_end_point_h(0).g & 0xFE) << 27);
    block.y = ((get_end_point_h(0).g & 0xFE) >> 5) | ((get_end_point_l(0).b & 0xFE) << 2) | ((get_end_point_h(0).b & 0xFE) << 9) |
              ((get_end_point_l(0).a & 0xFE) << 16) | ((get_end_point_h(0).a & 0xFE) << 23) | (get_end_point_l(0).r & 0x01) << 31;
    block.z = (get_end_point_h(0).r & 0x01) | (get_color_index(0) << 1) | (get_color_index(1) << 4) | (get_color_index(2) << 8) | (get_color_index(3) << 12) |
              (get_color_index(4) << 16) | (get_color_index(5) << 20) | (get_color_index(6) << 24) | (get_color_index(7) << 28);
    block.w = (get_color_index(8) << 0) | (get_color_index(9) << 4) | (get_color_index(10) << 8) | (get_color_index(11) << 12) | (get_color_index(12) << 16) |
              (get_color_index(13) << 20) | (get_color_index(14) << 24) | (get_color_index(15) << 28);
}

// Assembles the 128-bit mode 7 block (mode bit 0x80): 6-bit partition id,
// two subsets of 5-bit RGBA endpoints, per-endpoint P-bits, and 2-bit color
// indices.
void block_package7(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    block.x = 0x80 | (partition << 8) | ((get_end_point_l(0).r & 0xF8) << 11) | ((get_end_point_h(0).r & 0xF8) << 16) | ((get_end_point_l(1).r & 0xF8) << 21) |
              ((get_end_point_h(1).r & 0xF8) << 26);
    block.y = ((get_end_point_h(1).r & 0xF8) >> 6) | ((get_end_point_l(0).g & 0xF8) >> 1) | ((get_end_point_h(0).g & 0xF8) << 4) |
              ((get_end_point_l(1).g & 0xF8) << 9) | ((get_end_point_h(1).g & 0xF8) << 14) | ((get_end_point_l(0).b & 0xF8) << 19) |
              ((get_end_point_h(0).b & 0xF8) << 24);
    block.z = ((get_end_point_l(1).b & 0xF8) >> 3) | ((get_end_point_h(1).b & 0xF8) << 2) | ((get_end_point_l(0).a & 0xF8) << 7) |
              ((get_end_point_h(0).a & 0xF8) << 12) | ((get_end_point_l(1).a & 0xF8) << 17) | ((get_end_point_h(1).a & 0xF8) << 22) |
              ((get_end_point_l(0).r & 0x04) << 28) | ((get_end_point_h(0).r & 0x04) << 29);
    block.w = ((get_end_point_l(1).r & 0x04) >> 2) | ((get_end_point_h(1).r & 0x04) >> 1) | (get_color_index(0) << 2);

    CGU_UINT32 i = 1;
    // Indices after the anchor position lose one bit of shift (anchor MSB
    // is implicit zero and not stored).
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}

// Thread-group barrier on the GPU path; compiled out (no-op) on CPU builds.
void GroupSync()
{
#ifdef ASPM_GPU
    GroupMemoryBarrierWithGroupSync();
#endif
}

// Applies the BC7 channel rotation to a pixel: 1 swaps R<->A, 2 swaps G<->A,
// 3 swaps B<->A; rotation 0 leaves the pixel unchanged.
void set_pixel_rotation(CMP_INOUT CGU_Vec4ui CMP_REFINOUT pixel, CGU_UINT32 rotation)
{
#ifdef ASPM_GPU
    // GPU path: swizzled assignment performs the swap in one statement.
    if (1 == rotation)
    {
        pixel.ra = pixel.ar;
    }
    else if (2 == rotation)
    {
        pixel.ga = pixel.ag;
    }
    else if (3 == rotation)
    {
        pixel.ba = pixel.ab;
    }
#else
    // CPU path: explicit temporaries, since swizzled assignment is not
    // available on CGU_Vec4ui here.
    CGU_UINT32 r, g, b, a;
    r = pixel.r;
    g = pixel.g;
    b = pixel.b;
    a = pixel.a;

    if (1 == rotation)
    {
        pixel.r = a;
        pixel.a = r;
    }
    else if (2 == rotation)
    {
        pixel.g = a;
        pixel.a = g;
    }
    else if (3 == rotation)
    {
        pixel.b = a;
        pixel.a = b;
    }
#endif
}

// Returns true when any of the 16 pixels of the block at threadBase carries
// a non-opaque alpha (< 255), signalling that an alpha-capable mode is
// required; returns false otherwise, or when no mode-6 path is compiled in.
// Fix: the guard previously tested ENABLED_MODE6, a macro that is defined
// nowhere (this file defines ENABLE_MODE6), so the scan silently depended on
// ENABLE_CMP_MODE6 alone.
CGU_BOOL cmp_ImageHasAlpha(CGU_UINT32 threadBase)
{
#if defined(ENABLE_MODE6) || defined(ENABLE_CMP_MODE6)
    CGU_UINT32 alpha;
    for (CGU_INT ii = 0; ii < 16; ii++)
    {
        alpha = shared_temp[threadBase + ii].pixel.a;
        if (alpha < 255)
            return true;
    }
#endif
    return false;
}

#ifdef ENABLE_CMP_API

// Interpolate between endpoints e0 and e1 using the 6-bit fixed-point BC7
// weight for 'index' at the given index precision (2, 3 or 4 bits; any other
// value falls back to the 4-bit table, matching the original behavior).
CGU_UINT32 GetRamp2(CGU_UINT32 e0, CGU_UINT32 e1, CGU_UINT32 index, CGU_UINT32 indexprecision)
{
    // aWeight rows: 0 -> 4-bit indices, 1 -> 3-bit, 2 -> 2-bit.
    CGU_UINT32 row;
    if (indexprecision == 2)
        row = 2;
    else if (indexprecision == 3)
        row = 1;
    else
        row = 0;

    CGU_UINT32 w = aWeight[row][index];
    return (CGU_UINT32)(((64 - w) * e0 + w * e1 + 32) >> 6);
}

//====================================== MODE 6 ==========================================
// Canonicalizes the block before packing: if the anchor index (lowest nibble
// of block_index[0]) has its MSB set, swap the endpoint pair and complement
// every packed index so the anchor's MSB becomes 0, as BC7 requires.
void cmp_encode_apply_swap(CMP_INOUT CGU_Vec4ui epo_code_out[2], CMP_INOUT CGU_UINT32 block_index[2], CMP_IN CGU_INT bits)
{
    CGU_UINT32 levels = 1 << bits;
    CGU_UINT32 half   = levels / 2;

    if ((block_index[0] & 15) < half)
        return;  // anchor already canonical, nothing to do

    // Swap the endpoint pair.
    CGU_Vec4ui tmp  = epo_code_out[0];
    epo_code_out[0] = epo_code_out[1];
    epo_code_out[1] = tmp;

    // Complement all eight 4-bit indices in each packed word at once:
    // 0x11111111 * (levels-1) replicates (levels-1) into every nibble.
    CGU_UINT32 allMax = (CGU_UINT32)(0x11111111 * (levels - 1));
    block_index[0]    = allMax - block_index[0];
    block_index[1]    = allMax - block_index[1];
}

// Writes the low 'bits' bits of bitVal into the 128-bit block buffer 'base'
// starting at bit position 'offset' (LSB-first), spilling into the next
// 32-bit word when the value straddles a word boundary.
// Returns the advanced bit position (offset + bits).
CGU_INT cmp_Write32Bit(CMP_INOUT CGU_UINT32 base[4], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT32 bitVal)
{
    CGU_INT word = offset / 32;
    CGU_INT bit  = offset % 32;

    base[word] |= ((CGU_UINT32)bitVal) << bit;

    // Straddles a 32-bit boundary: place the remaining high bits in the
    // following word (bounds-checked against the 4-word block).
    if ((bit + bits > 32) && (word + 1 < 4))
        base[word + 1] |= cmp_shift_right_uint32(bitVal, 32 - bit);

    return offset + bits;
}

// Packs the 16 per-pixel indices into the output block starting at bit pPos.
// color_index[] holds 8 indices per word, one nibble each; 'flips' is a
// 16-bit mask of pixels whose index must be complemented (their subset's
// endpoints were swapped). The first index written is the anchor: its MSB is
// implicitly 0, so it is stored with one fewer bit.
void cmp_encode_index2(CMP_INOUT CGU_UINT32 data[4], CMP_IN CGU_INT pPos, CMP_INOUT CGU_UINT32 color_index[2], CMP_IN CGU_INT bits, CMP_IN CGU_INT flips)
{
    CGU_INT levels        = 1 << bits;
    CGU_INT flips_shifted = flips;
    for (CGU_INT k1 = 0; k1 < 2; k1++)  // two packed words, 8 nibbles each
    {
        CGU_UINT32 qbits_shifted = color_index[k1];
        for (CGU_INT k2 = 0; k2 < 8; k2++)
        {
            CGU_UINT32 q = qbits_shifted & 15;  // current 4-bit index
            if ((flips_shifted & 1) > 0)
                q = (levels - 1) - q;  // complement for swapped endpoints

            if (k1 == 0 && k2 == 0)
                pPos = cmp_Write32Bit(data, pPos, bits - 1, q);  // anchor: MSB dropped
            else
                pPos = cmp_Write32Bit(data, pPos, bits, q);
            qbits_shifted >>= 4;
            flips_shifted >>= 1;
        }
    }
}

// Computes the block mean and the principal axis of the RGBA covariance
// matrix by power iteration. image_mean receives the per-channel average of
// the first numEntries pixels; eigen_vector receives the dominant direction
// used to project pixels onto an endpoint line.
void cmp_eigen_vector(CMP_INOUT CGV_Vec4f CMP_REFINOUT eigen_vector,
                      CMP_INOUT CGU_Vec4f CMP_REFINOUT image_mean,
                      CMP_IN CGV_Vec4ui                image_src[16],
                      CMP_IN CGU_INT                   numEntries)
{
    CGU_INT k;
    image_mean   = 0.0f;
    eigen_vector = 0.0f;
    CGV_FLOAT vector_covOut[10];
    // Packed upper triangle of the symmetric 4x4 covariance matrix:
    // [0]=xx [1]=xy [2]=xz [3]=xw [4]=yy [5]=yz [6]=yw [7]=zz [8]=zw [9]=ww
    CGV_FLOAT covar[10] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    CGV_Vec4f rgbasum   = {0.0f, 0.0f, 0.0f, 0.0f};

    // Single pass: accumulate channel sums and raw second moments.
    for (k = 0; k < numEntries; k++)
    {
        CGV_Vec4f rgba;
        rgba.x = image_src[k].x;
        rgba.y = image_src[k].y;
        rgba.z = image_src[k].z;
        rgba.w = image_src[k].w;

        rgbasum.x += rgba.x;
        rgbasum.y += rgba.y;
        rgbasum.z += rgba.z;
        rgbasum.w += rgba.w;

        covar[0] += rgba.x * rgba.x;  //covar[0].x => covar[0]
        covar[1] += rgba.x * rgba.y;  //covar[0].y => covar[1]
        covar[2] += rgba.x * rgba.z;  //covar[0].z => covar[2]
        covar[3] += rgba.x * rgba.w;  //covar[0].w => covar[3]
        covar[4] += rgba.y * rgba.y;  //covar[1].y => covar[4]
        covar[5] += rgba.y * rgba.z;  //covar[1].z => covar[5]
        covar[6] += rgba.y * rgba.w;  //covar[1].w => covar[6]
        covar[7] += rgba.z * rgba.z;  //covar[2].z => covar[7]
        covar[8] += rgba.z * rgba.w;  //covar[2].w => covar[8]
        covar[9] += rgba.w * rgba.w;  //covar[3].w => covar[9]
    }

    image_mean = rgbasum / (CGV_FLOAT)numEntries;

    // Central covariance (scaled by numEntries): sum(ab) - sum(a)*sum(b)/n.
    vector_covOut[0] = covar[0] - (rgbasum.x * rgbasum.x / numEntries);
    vector_covOut[1] = covar[1] - (rgbasum.x * rgbasum.y / numEntries);
    vector_covOut[2] = covar[2] - (rgbasum.x * rgbasum.z / numEntries);
    vector_covOut[3] = covar[3] - (rgbasum.x * rgbasum.w / numEntries);
    vector_covOut[4] = covar[4] - (rgbasum.y * rgbasum.y / numEntries);
    vector_covOut[5] = covar[5] - (rgbasum.y * rgbasum.z / numEntries);
    vector_covOut[6] = covar[6] - (rgbasum.y * rgbasum.w / numEntries);
    vector_covOut[7] = covar[7] - (rgbasum.z * rgbasum.z / numEntries);
    vector_covOut[8] = covar[8] - (rgbasum.z * rgbasum.w / numEntries);
    vector_covOut[9] = covar[9] - (rgbasum.w * rgbasum.w / numEntries);

    // Pre-scale to keep the iteration numerically tame (8-bit channel range).
    CGV_FLOAT inv_var = 1.0 / (256 * 256);  // GPU multiply is faster 1.5258789062500000e-05
    for (k = 0; k < 10; k++)
    {
        vector_covOut[k] = vector_covOut[k] * inv_var;
    }

    // Power iteration: repeatedly multiply a seed vector by the covariance
    // matrix; it converges toward the dominant eigenvector.
    CGV_Vec4f vec             = {1.0f, 1.0f, 1.0f, 1.0f};
    CGU_INT   powerIterations = 6;  // 4 not enough for HQ : can use quality to set ranges from 2..n

    for (k = 0; k < powerIterations; k++)
    {
        eigen_vector.x = vector_covOut[0] * vec.x + vector_covOut[1] * vec.y + vector_covOut[2] * vec.z + vector_covOut[3] * vec.w;
        eigen_vector.y = vector_covOut[1] * vec.x + vector_covOut[4] * vec.y + vector_covOut[5] * vec.z + vector_covOut[6] * vec.w;
        eigen_vector.z = vector_covOut[2] * vec.x + vector_covOut[5] * vec.y + vector_covOut[7] * vec.z + vector_covOut[8] * vec.w;
        eigen_vector.w = vector_covOut[3] * vec.x + vector_covOut[6] * vec.y + vector_covOut[8] * vec.z + vector_covOut[9] * vec.w;

        // renormalize every other iteration
        if (k % 2 == 1)
        {
            CGV_FLOAT norm_sq = cmp_dot4f(eigen_vector, eigen_vector);
            CGV_FLOAT rnorm   = cmp_Image_rsqrt(norm_sq);
            vec               = eigen_vector * rnorm;
        }
        else
            vec = eigen_vector;
    }

    eigen_vector = vec;

    //printf("eigen_vector [%1.8f,%1.3f,%1.8f,%1.8f]\n", eigen_vector.x, eigen_vector.y, eigen_vector.z, eigen_vector.w);
}

// Quantizes the two endpoints of the projection line for mode 6. Each
// endpoint is projected back into RGBA space (ext * axis + mean) and rounded
// to the nearest even (qep_b[0]) and nearest odd (qep_b[1]) code; the variant
// with the smaller squared error wins, which effectively selects the P-bit.
// NOTE(review): subset 0's result is stored in end_points_out[1] and
// subset 1's in end_points_out[0] — the cross-assignment appears intentional
// (ext[0] is the min extent, ext[1] the max); confirm against the caller.
void cmp_endpoints2(CMP_INOUT CGU_Vec4ui end_points_out[2], CMP_IN CGV_Vec4f ext[2], CMP_IN CGV_Vec4f eigen_vector, CMP_IN CGV_Vec4f image_mean)
{
    CGV_FLOAT levelHigh = 255;  // Mode 6 levels = 1 << bits = 128   then use (level * 2) - 1
    CGV_FLOAT levelLow  = 254;  // Mode 6 levels = 1 << bits = 128   then use (level * 2) - 2
    CGV_Vec4f qep_b[2];
    CGV_FLOAT err0 = 0.0f;
    CGV_FLOAT err1 = 0.0f;
    CGV_Vec4f block_endpoints[2];

    // Map the 1-D extents back to RGBA endpoints along the principal axis.
    block_endpoints[0] = ext[0] * eigen_vector + image_mean;
    block_endpoints[1] = ext[1] * eigen_vector + image_mean;

    for (CGU_INT subset = 0; subset < 2; subset++)
    {  // this code effects quality
        // Candidate 0: round to the nearest even code (implicit P-bit 0).
        qep_b[0].x = cmp_clampf((CGV_INT)((block_endpoints[subset].x / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        qep_b[0].y = cmp_clampf((CGV_INT)((block_endpoints[subset].y / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        qep_b[0].z = cmp_clampf((CGV_INT)((block_endpoints[subset].z / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        qep_b[0].w = cmp_clampf((CGV_INT)((block_endpoints[subset].w / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);

        // Candidate 1: round to the nearest odd code (implicit P-bit 1).
        qep_b[1].x = cmp_clampf((CGV_INT)((block_endpoints[subset].x / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        qep_b[1].y = cmp_clampf((CGV_INT)((block_endpoints[subset].y / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        qep_b[1].z = cmp_clampf((CGV_INT)((block_endpoints[subset].z / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        qep_b[1].w = cmp_clampf((CGV_INT)((block_endpoints[subset].w / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);

        // Keep whichever candidate is closer to the unquantized endpoint.
        err0 = cmp_dot4f(block_endpoints[subset] - qep_b[0], block_endpoints[subset] - qep_b[0]);
        err1 = cmp_dot4f(block_endpoints[subset] - qep_b[1], block_endpoints[subset] - qep_b[1]);
        if (subset == 0)
        {
            end_points_out[1].x = (err0 < err1) ? qep_b[0].x : qep_b[1].x;
            end_points_out[1].y = (err0 < err1) ? qep_b[0].y : qep_b[1].y;
            end_points_out[1].z = (err0 < err1) ? qep_b[0].z : qep_b[1].z;
            end_points_out[1].w = (err0 < err1) ? qep_b[0].w : qep_b[1].w;
        }
        else
        {
            end_points_out[0].x = ((err0 < err1) ? qep_b[0].x : qep_b[1].x);
            end_points_out[0].y = ((err0 < err1) ? qep_b[0].y : qep_b[1].y);
            end_points_out[0].z = ((err0 < err1) ? qep_b[0].z : qep_b[1].z);
            end_points_out[0].w = ((err0 < err1) ? qep_b[0].w : qep_b[1].w);
        }
    }
}

// Finds the min/max extents of the masked pixels projected onto the
// principal axis, then quantizes them into mode-6 endpoint codes via
// cmp_endpoints2(). partition_mask selects which of the 16 pixels belong to
// this subset (bit i == pixel i).
// NOTE(review): the loop runs k3 <= numEntries (the parameter is documented
// as "range 0..15", i.e. a last index), while cmp_eigen_vector iterates
// k < numEntries — confirm whether numEntries is a count or a last index.
void cmp_block_endpoints(CMP_INOUT CGU_Vec4ui end_points_out[2],
                         CMP_IN CGV_Vec4f     eigen_vector,
                         CMP_IN CGV_Vec4f     image_mean,
                         CMP_IN CGU_Vec4ui    image_src[16],
                         CMP_IN CGU_INT       numEntries,     //IN: range 0..15 (MAX_SUBSET_SIZE)
                         CMP_IN CGU_INT       partition_mask  // 0xFFFF:FFFF
)
{
    // ext[0] tracks the minimum projection, ext[1] the maximum; all four
    // lanes of each carry the same scalar value.
    CGV_Vec4f ext[2] = {{255.0f, 255.0f, 255.0f, 255.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};

    // find min/max
    CGV_INT mask_shifted = partition_mask << 1;
    for (CGU_INT k3 = 0; k3 <= numEntries; k3++)
    {
        mask_shifted >>= 1;
        if ((mask_shifted & 1) == 0)
            continue;  // pixel not in this partition subset

        // Signed distance of this pixel from the mean along the axis.
        CGV_FLOAT dot = 0;
        CGV_Vec4f diff;
        diff.x = image_src[k3].x - image_mean.x;
        diff.y = image_src[k3].y - image_mean.y;
        diff.z = image_src[k3].z - image_mean.z;
        diff.w = image_src[k3].w - image_mean.w;

        dot += cmp_dot4f(eigen_vector, diff);

        ext[0].x = cmp_minf(ext[0].x, dot);
        ext[0].y = cmp_minf(ext[0].y, dot);
        ext[0].z = cmp_minf(ext[0].z, dot);
        ext[0].w = cmp_minf(ext[0].w, dot);

        ext[1].x = cmp_maxf(ext[1].x, dot);
        ext[1].y = cmp_maxf(ext[1].y, dot);
        ext[1].z = cmp_maxf(ext[1].z, dot);
        ext[1].w = cmp_maxf(ext[1].w, dot);
    }

    // create some distance if the endpoints collapse
    if (ext[1].x - ext[0].x < 1.0f)
    {
        ext[0] -= 0.5f;
        ext[1] += 0.5f;
    }

    cmp_endpoints2(end_points_out, ext, eigen_vector, image_mean);
}

// Clamps v into the inclusive range [a, b].
CGV_UINT8 clampIndex2(CGV_UINT8 v, CGV_UINT8 a, CGV_UINT8 b)
{
    return (v < a) ? a : ((v > b) ? b : v);
}

// Assigns a ramp index to every pixel by projecting it onto the principal
// axis. The scale image_s is numEntries / (low - high), i.e. negative, so the
// highest projection maps to index 0 and the lowest to numEntries; indices
// are then rebased so the minimum becomes 0 and clamped to 0..15.
// If all projections are identical the function returns early, leaving
// index_out as the caller initialized it.
// NOTE(review): loops run i <= numEntries, treating numEntries as the last
// valid index (0..15) rather than a count — confirm against callers.
void cmp_block_index(CMP_INOUT CGU_UINT32 index_out[16],
                     CMP_IN CGV_Vec4f     eigen_vector,
                     CMP_IN CGV_Vec4f     image_mean,
                     CMP_IN CGU_Vec4ui    image_src[16],
                     CMP_IN CGU_UINT32    numEntries  // Range 0..15 (MAX_SUBSET_SIZE)
)
{
    //=====================
    // Get Projected Index
    //=====================
    CGV_FLOAT image_projected[16];

    CGV_FLOAT image_v[16];
    CGV_FLOAT image_z[16];
    CGV_FLOAT projected_high;  // Values are +ve about centered image projection
    CGV_FLOAT projected_low;   // Values are -ve about centered image projection
    CGV_FLOAT image_s;

    //====================================================================
    // Center the image to new coordinate axis centered at the mean value
    //====================================================================
    CGV_Vec4f image_centered[16];
    CGV_Vec4f diff;
    for (CGU_UINT32 k1 = 0; k1 <= numEntries; k1++)
    {
        diff.x              = image_src[k1].x - image_mean.x;
        diff.y              = image_src[k1].y - image_mean.y;
        diff.z              = image_src[k1].z - image_mean.z;
        diff.w              = image_src[k1].w - image_mean.w;
        image_centered[k1]  = diff * eigen_vector;
        // Scalar projection = dot(diff, eigen_vector), summed channel-wise.
        image_projected[k1] = image_centered[k1].x + image_centered[k1].y + image_centered[k1].z + image_centered[k1].w;
    }

    projected_high = image_projected[0];
    projected_low  = image_projected[0];

    // Find the extremes of the projection range.
    for (CGU_UINT32 i1 = 1; i1 <= numEntries; i1++)
    {
        if (projected_high < image_projected[i1])
            projected_high = image_projected[i1];
        if (projected_low > image_projected[i1])
            projected_low = image_projected[i1];
    }

    // Negative span: low - high. A flat block has nothing to index.
    CGV_FLOAT img_diff = projected_low - projected_high;

    if (img_diff == 0.0f)
        return;

    image_s = numEntries / img_diff;

    // Get initial index projection
    for (CGU_UINT32 idx = 0; idx <= numEntries; idx++)
    {
        image_v[idx]   = image_projected[idx] * image_s;
        // Offset by high*s so the range spans 0..numEntries before rounding.
        image_z[idx]   = cmp_floor(image_v[idx] + 0.5F - projected_high * image_s);
        index_out[idx] = (CGV_UINT32)image_z[idx];
    }

    // get minimum index
    CGU_UINT32 index_min = index_out[0];
    for (CGU_UINT32 i3 = 1; i3 <= numEntries; i3++)
    {
        if (index_out[i3] < index_min)
            index_min = index_out[i3];
    }

    // Reposition all index by min index (using min index as 0)
    //printf("index : ");
    for (CGU_UINT32 i4 = 0; i4 <= numEntries; i4++)
    {
        index_out[i4] = clampIndex2(index_out[i4] - index_min, 0, 15);
        //printf("%02x,", index_out[i4]);
    }
    //printf("\n");
}

// Computes the total squared reconstruction error of the block for a given
// pair of mode-6 endpoints: each pixel is assigned its best 4-bit ramp index
// by projecting onto the endpoint span, reconstructed via the BC7 weight
// table, and compared against the source pixel.
// NOTE(review): span_norm_sqr and dotProduct are Vec2i but receive the scalar
// result of cmp_dotVec4i; only the .x lane is used — presumably a broadcast
// assignment, confirm against cmp_dotVec4i's definition.
CGU_UINT32 cmp_calcblockerr(CGU_Vec4ui endPoint_in[2], CGU_Vec4ui image_src[16])
{
    CGU_UINT32 error = 0;
    CGU_Vec4ui pixel = image_src[0];
    CGU_Vec4ui endPoint[2];
    CGU_Vec4i  pixelDiff;

    endPoint[0] = endPoint_in[0];
    endPoint[1] = endPoint_in[1];
    pixelDiff.x = pixel.x - endPoint[0].x;
    pixelDiff.y = pixel.y - endPoint[0].y;
    pixelDiff.z = pixel.z - endPoint[0].z;
    pixelDiff.w = pixel.w - endPoint[0].w;

    CGU_Vec4i span;
    CGU_Vec2i span_norm_sqr;
    CGU_Vec2i dotProduct;

    span.x = endPoint[1].x - endPoint[0].x;
    span.y = endPoint[1].y - endPoint[0].y;
    span.z = endPoint[1].z - endPoint[0].z;
    span.w = endPoint[1].w - endPoint[0].w;

    span_norm_sqr = cmp_dotVec4i(span, span);
    dotProduct    = cmp_dotVec4i(span, pixelDiff);
    // Orient the span so pixel 0 (the anchor) lands in the lower half of the
    // ramp; 63.49999 scales the projection into the 0..63 table range.
    if (span_norm_sqr.x > 0 && dotProduct.x >= 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
    {
        span.x = -span.x;
        span.y = -span.y;
        span.z = -span.z;
        span.w = -span.w;
        swap(endPoint[0], endPoint[1]);
    }

    CGU_UINT32 color_index;
    CGU_Vec4ui pixel_r;
    for (CGU_UINT32 i = 0; i < 16; i++)
    {
        pixel = image_src[i];

        pixelDiff.x = pixel.x - endPoint[0].x;
        pixelDiff.y = pixel.y - endPoint[0].y;
        pixelDiff.z = pixel.z - endPoint[0].z;
        pixelDiff.w = pixel.w - endPoint[0].w;

        // Pick the nearest 4-bit index via the aStep quantization table;
        // degenerate spans or negative projections collapse to index 0.
        dotProduct.x = cmp_dotVec4i(span, pixelDiff);
        color_index  = (span_norm_sqr.x <= 0 || dotProduct.x <= 0)
                           ? 0
                           : ((dotProduct.x < span_norm_sqr.x) ? aStep[0][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[0][63]);

        // Reconstruct the pixel from the endpoints and the 4-bit weights.
        pixel_r = (endPoint[0] * (64 - aWeight[0][color_index]) + endPoint[1] * aWeight[0][color_index] + 32u) >> 6;

        Ensure_A_Is_Larger(pixel_r, pixel);
        pixel_r -= pixel;
        error += ComputeError(pixel_r, pixel_r);
    }

    return error;
}

// Derives mode-6 endpoints and per-pixel indices for a block via PCA:
// computes the mean and principal axis, quantizes the projected extents into
// endpoint codes, assigns ramp indices, and returns the block's squared
// reconstruction error for those endpoints.
CGU_FLOAT cmp_GetIndexedEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],
                                  CMP_INOUT CGU_UINT32 index_out[16],
                                  CMP_IN CGU_Vec4ui    image_src[16],
                                  CMP_IN CGU_INT       numEntries,
                                  CMP_IN CGU_INT       partition_mask)
{
    CGV_Vec4f block_mean = {0.0f, 0.0f, 0.0f, 0.0f};
    CGV_Vec4f block_axis;

    // Start from a cleared index set.
    for (CGU_INT i = 0; i < 16; i++)
        index_out[i] = 0;

    cmp_eigen_vector(block_axis, block_mean, image_src, numEntries);
    cmp_block_endpoints(epo_code_out, block_axis, block_mean, image_src, numEntries, partition_mask);
    cmp_block_index(index_out, block_axis, block_mean, image_src, numEntries);

    return cmp_calcblockerr(epo_code_out, image_src);
}

// Packs a complete BC7 mode 6 block into cmp_out[4] (128 bits, LSB-first):
// 7 mode bits (value 64 = one-hot bit 6 identifies mode 6), then the 7-bit
// R,G,B,A endpoint pairs, two per-endpoint P-bits (the endpoints' LSBs), and
// finally 63 bits of 4-bit indices (the anchor index drops its MSB).
void cmp_encode_mode6(CMP_INOUT CGU_UINT32 cmp_out[4], CMP_IN CGU_Vec4ui epo_code_out[2], CMP_IN CGU_UINT32 packed_color_index[2])
{
    // Canonicalize so the anchor index MSB is 0 (may swap the endpoints).
    cmp_encode_apply_swap(epo_code_out, packed_color_index, 4);
    CGU_INT k;

    for (k = 0; k < 4; k++)
        cmp_out[k] = 0;

    CGU_INT pos = 0;

    // mode 6
    pos = cmp_Write32Bit(cmp_out, pos, 7, 64);

    // endpoints: 7 MSBs of each 8-bit channel value, low/high interleaved
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].x >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].x >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].y >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].y >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].z >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].z >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].w >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].w >> 1);

    // p bits: shared LSB of all four channels of each endpoint
    pos = cmp_Write32Bit(cmp_out, pos, 1, epo_code_out[0].x & 1);
    pos = cmp_Write32Bit(cmp_out, pos, 1, epo_code_out[1].x & 1);

    // quantized values
    cmp_encode_index2(cmp_out, pos, packed_color_index, 4, 0);
}

//====================================== MODES 01237 ==========================================
// Collapses an index set to canonical form: subtracts the minimum index from
// every entry, then divides out the largest common step D so the indices use
// the smallest possible range. Returns the maximum collapsed index (0 means
// all indices were equal).
// BUGFIX 1: the divisor search previously tested 'ent >= numEntries' INSIDE
// the inner loop, where it is always false (the loop condition guarantees
// ent < numEntries), so D was never updated and the division step was a
// no-op. The "all entries divisible" check belongs after the inner loop.
// BUGFIX 2: the final maximum was accumulated on top of the pre-collapse
// MaxIndex, returning a stale value whenever minIndex > 0 or D > 1; it is
// now recomputed from scratch over the collapsed indices.
CGU_UINT32 index_collapse2(CMP_INOUT CGU_UINT32 index[16], CGU_UINT32 numEntries)
{
    CGU_UINT32 minIndex = index[0];
    CGU_UINT32 MaxIndex = index[0];

    for (CGU_UINT32 km = 1; km < numEntries; km++)
    {
        if (index[km] < minIndex)
            minIndex = index[km];
        if (index[km] > MaxIndex)
            MaxIndex = index[km];
    }

    if (MaxIndex == 0)
        return 0;

    // Find the largest d that divides every (index - minIndex) exactly.
    CGU_UINT32 D = 1;
    for (CGU_UINT32 d = 2; d <= MaxIndex - minIndex; d++)
    {
        CGU_UINT32 ent = 0U;
        for (; ent < numEntries; ent++)
        {
            if (((index[ent] - minIndex) % d) != 0)
                break;  // d does not divide this entry
        }
        if (ent >= numEntries)  // every entry was divisible by d
            D = d;
    }

    CGU_FLOAT invD = 1.0f / D;
    for (CGU_UINT32 ki = 0; ki < numEntries; ki++)
    {
        index[ki] = (CGU_UINT32)((index[ki] - minIndex) * invD);
    }

    // Recompute the maximum over the collapsed indices.
    MaxIndex = index[0];
    for (CGU_UINT32 k = 1; k < numEntries; k++)
    {
        if (index[k] > MaxIndex)
            MaxIndex = index[k];
    }

    return (MaxIndex);
}

// Computes the rounded per-cluster mean color of the source pixels.
// index_cluster maps each pixel to a cluster id (low 4 bits); i_cnt counts
// members per cluster, i_comp records cluster ids in first-seen order so only
// occupied clusters are averaged. Means are rounded to the nearest integer;
// the alpha channel is zeroed for 3-channel (RGB) blocks.
INLINE void GetClusterMean2(CMP_INOUT CGV_Vec4f image_cluster_mean[16],
                            CMP_IN CGU_Vec4ui   image_src[16],
                            CMP_IN CGU_UINT32   index_cluster[16],
                            CMP_IN CGU_UINT32   numEntries,  // < 16
                            CMP_IN CGU_UINT32   channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    // unused index values are undefined
    CGU_UINT32 i_cnt[16];
    CGU_UINT32 i_comp[16];
    CGU_UINT32 idx;

    // Reset counters/means only for clusters actually referenced.
    for (CGU_UINT32 i0 = 0; i0 < numEntries; i0++)
    {
        idx                     = index_cluster[i0] & 0x0F;
        i_cnt[idx]              = 0;
        image_cluster_mean[idx] = 0.0f;
    }

    // Accumulate sums per cluster and collect the distinct cluster ids.
    CGU_UINT32 ic = 0;
    for (CGU_UINT32 i1 = 0; i1 < numEntries; i1++)
    {
        idx = index_cluster[i1] & 0x0F;
        if (i_cnt[idx] == 0)
            i_comp[ic++] = idx;
        i_cnt[idx]++;

        image_cluster_mean[idx].x += image_src[i1].x;
        image_cluster_mean[idx].y += image_src[i1].y;
        image_cluster_mean[idx].z += image_src[i1].z;
        image_cluster_mean[idx].w += image_src[i1].w;
    }

    // Convert sums to rounded means for each occupied cluster.
    for (CGU_UINT32 i = 0; i < ic; i++)
    {
        CGU_UINT32 icmp = i_comp[i];
        if (i_cnt[icmp] != 0)
        {
            image_cluster_mean[icmp].x = (CGV_FLOAT)cmp_floor((image_cluster_mean[icmp].x / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            image_cluster_mean[icmp].y = (CGV_FLOAT)cmp_floor((image_cluster_mean[icmp].y / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            image_cluster_mean[icmp].z = (CGV_FLOAT)cmp_floor((image_cluster_mean[icmp].z / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            if (channels3or4 == 4)
                image_cluster_mean[icmp].w = (CGV_FLOAT)cmp_floor((image_cluster_mean[icmp].w / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            else
                image_cluster_mean[icmp].w = 0.0f;
        }
    }
}

#ifndef ASPM_HLSL  // CPU Version

#define USE_OLDCODE

// Returns the subset (0..2) that pixel 'index' belongs to for the given
// partition shape. 2-subset shapes use one bit per pixel; 3-subset shapes
// (table entries 64..127) encode subset membership in two 16-bit masks.
// BUGFIX: the 3-subset return previously relied on ?: precedence and parsed
// as a nested conditional, not the intended sum of the two mask tests; the
// result was unchanged only because mask0 and mask1 are disjoint per pixel.
// The expression is now explicitly parenthesized to match the intent.
INLINE CGU_UINT8 cmp_get_partition_subset2(CMP_IN CGU_INT part_id, CMP_IN CGU_INT maxSubsets, CMP_IN CGU_INT index)
{
    if (maxSubsets == 2)
    {
        CGU_UINT32 mask_packed = subset_mask_table2[part_id];
        return ((mask_packed & (0x01 << index)) ? 1 : 0);  // This can be moved to caller, just return mask!!
    }

    // 3 region subsets
    part_id += 64;
    CGU_UINT32 mask0 = subset_mask_table2[part_id] & 0xFFFF;
    CGU_UINT32 mask1 = subset_mask_table2[part_id] >> 16;
    CGU_UINT32 mask  = 0x01 << index;

    return (((mask1 & mask) ? 2 : 0) + ((mask0 & mask) ? 1 : 0));  // This can be moved to caller, just return mask!!
}

// Scatters the 16 source pixels into per-subset pixel lists according to the
// partition shape. Modes 0 and 2 use 3 subsets, modes 1, 3 and 7 use 2.
// entryCount_out receives the number of pixels landing in each subset.
// For 3-channel (RGB) encodes the alpha lane is forced to zero so it cannot
// influence downstream error metrics.
// Fix: the alpha clear previously assigned the float literal 0.0F to a uint
// vector component; now uses an integer zero (same value, correct type).
void cmp_GetPartitionSubSet2_mode01237(CMP_INOUT CGV_Vec4ui image_subsets[3][16],  // OUT: Subset pattern mapped with image src colors
                                       CMP_INOUT CGU_INT    entryCount_out[3],     // OUT: Number of entries per subset
                                       CMP_IN CGU_UINT8     partition,             // Partition Shape 0..63
                                       CMP_IN CGV_Vec4ui    image_src[16],         // Image colors
                                       CMP_IN CGU_INT       blockMode,             // [0,1,2,3 or 7]
                                       CMP_IN CGU_UINT8     channels3or4)
{  // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    CGU_UINT8 maxSubsets = 2;
    if (blockMode == 0 || blockMode == 2)
        maxSubsets = 3;

    entryCount_out[0] = 0;
    entryCount_out[1] = 0;
    entryCount_out[2] = 0;

    for (CGU_INT i = 0; i < 16; i++)
    {
        CGU_UINT8 subset = cmp_get_partition_subset2(partition, maxSubsets, i);

        image_subsets[subset][entryCount_out[subset]].x = image_src[i].x;
        image_subsets[subset][entryCount_out[subset]].y = image_src[i].y;
        image_subsets[subset][entryCount_out[subset]].z = image_src[i].z;

        // if we have only 3 channels then set the alpha subset to 0
        if (channels3or4 == 3)
            image_subsets[subset][entryCount_out[subset]].w = 0;
        else
            image_subsets[subset][entryCount_out[subset]].w = image_src[i].w;
        entryCount_out[subset]++;
    }
}

// Computes the per-channel mean of the first numEntries pixels and the
// mean-centered pixel values. For 3-channel (RGB) encodes, mean_out.w stays
// 0 and image_centered[k].w is now explicitly zeroed (previously it was left
// unwritten, carrying whatever the caller had in that slot).
// Also removed the spurious "(channels3or4);" unused-parameter suppression —
// the parameter is used — and added a guard so numEntries == 0 no longer
// divides by zero (mean and output are left untouched in that case).
void cmp_GetImageCentered(CMP_INOUT CGV_Vec4f              image_centered[16],
                          CMP_INOUT CGV_Vec4f CMP_REFINOUT mean_out,
                          CMP_IN CGV_Vec4ui                image_src[16],
                          CMP_IN CGU_INT                   numEntries,
                          CMP_IN CGU_UINT8                 channels3or4)
{
    mean_out = 0.0f;
    CGU_INT k;

    if (numEntries < 1)
        return;  // empty subset: avoid division by zero

    for (k = 0; k < numEntries; k++)
    {
        mean_out.x = mean_out.x + image_src[k].x;
        mean_out.y = mean_out.y + image_src[k].y;
        mean_out.z = mean_out.z + image_src[k].z;
        if (channels3or4 == 4)
            mean_out.w = mean_out.w + image_src[k].w;
    }

    mean_out /= (CGV_FLOAT)numEntries;

    for (k = 0; k < numEntries; k++)
    {
        image_centered[k].x = image_src[k].x - mean_out.x;
        image_centered[k].y = image_src[k].y - mean_out.y;
        image_centered[k].z = image_src[k].z - mean_out.z;
        if (channels3or4 == 4)
            image_centered[k].w = image_src[k].w - mean_out.w;
        else
            image_centered[k].w = 0.0f;  // deterministic alpha for RGB blocks
    }
}

// Builds the (channels x channels) covariance matrix of the mean-centered
// pixels, stored row-major in a flat 16-entry array (entry ch1 + ch2*4).
// Only the lower triangle is accumulated; the upper triangle is then filled
// by symmetry.
void cmp_GetCovarianceVector(CMP_INOUT CGV_FLOAT covariance_out[16],
                             CMP_IN CGV_Vec4f    image_centered[16],
                             CMP_IN CGU_INT      numEntries,
                             CMP_IN CGU_UINT8    channels3or4)
{
    CGU_UINT8 ch1;
    CGU_UINT8 ch2;
    CGU_INT   k;

    // Lower triangle: sum of products of centered channel pairs.
    for (ch1 = 0; ch1 < channels3or4; ch1++)
        for (ch2 = 0; ch2 <= ch1; ch2++)
        {
            covariance_out[ch1 + ch2 * 4] = 0;
            for (k = 0; k < numEntries; k++)
                covariance_out[ch1 + ch2 * 4] += image_centered[k][ch1] * image_centered[k][ch2];
        }

    // Mirror into the upper triangle (covariance is symmetric).
    for (ch1 = 0; ch1 < channels3or4; ch1++)
        for (ch2 = ch1 + 1; ch2 < channels3or4; ch2++)
            covariance_out[ch1 + ch2 * 4] = covariance_out[ch2 + ch1 * 4];
}

// Estimates the dominant eigenvector of the covariance matrix: normalizes the
// matrix by its largest diagonal entry, squares it (one implicit power-method
// step), then takes the row of the squared matrix with the largest diagonal
// as the eigenvector estimate and normalizes it to unit length.
void cmp_GetEigenVector(CMP_INOUT CGV_Vec4f CMP_REFINOUT EigenVector_out,       // Normalized Eigen Vector output
                        CMP_IN CGV_FLOAT                 CovarianceVector[16],  // Covariance Vector
                        CMP_IN CGU_UINT8                 channels3or4)
{
    CGV_FLOAT vector_covIn[16];
    CGV_FLOAT vector_covOut[16];
    CGV_FLOAT vector_maxCovariance;
    CGU_UINT8 ch1;
    CGU_UINT8 ch2;
    CGU_UINT8 ch3;

    // Working copy of the covariance matrix (row-major, stride 4).
    for (ch1 = 0; ch1 < channels3or4; ch1++)
        for (ch2 = 0; ch2 < channels3or4; ch2++)
        {
            vector_covIn[ch1 + ch2 * 4] = CovarianceVector[ch1 + ch2 * 4];
        }

    vector_maxCovariance = 0;

    // Largest diagonal entry (variance) used as the normalization factor.
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        if (vector_covIn[ch1 + ch1 * 4] > vector_maxCovariance)
            vector_maxCovariance = vector_covIn[ch1 + ch1 * 4];
    }

    // Normalize Input Covariance Vector
    for (ch1 = 0; ch1 < channels3or4; ch1++)
        for (ch2 = 0; ch2 < channels3or4; ch2++)
        {
            if (vector_maxCovariance > 0)
                vector_covIn[ch1 + ch2 * 4] = vector_covIn[ch1 + ch2 * 4] / vector_maxCovariance;
        }

    // Square the matrix: covOut = covIn * covIn (amplifies the dominant mode).
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        for (ch2 = 0; ch2 < channels3or4; ch2++)
        {
            CGV_FLOAT vector_temp_cov = 0;
            for (ch3 = 0; ch3 < channels3or4; ch3++)
            {
                vector_temp_cov = vector_temp_cov + vector_covIn[ch1 + ch3 * 4] * vector_covIn[ch3 + ch2 * 4];
            }
            vector_covOut[ch1 + ch2 * 4] = vector_temp_cov;
        }
    }

    vector_maxCovariance = 0;

    CGU_INT maxCovariance_channel = 0;

    // Row of the squared matrix with the largest diagonal entry best
    // approximates the dominant eigenvector direction.
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        if (vector_covOut[ch1 + ch1 * 4] > vector_maxCovariance)
        {
            maxCovariance_channel = ch1;
            vector_maxCovariance  = vector_covOut[ch1 + ch1 * 4];
        }
    }

    CGV_FLOAT vector_t = 0;

    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        vector_t             = vector_t + vector_covOut[maxCovariance_channel + ch1 * 4] * vector_covOut[maxCovariance_channel + ch1 * 4];
        EigenVector_out[ch1] = vector_covOut[maxCovariance_channel + ch1 * 4];
    }

    // Normalize the Eigen Vector
    vector_t = sqrt(vector_t);
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        if (vector_t > 0)
            EigenVector_out[ch1] = EigenVector_out[ch1] / vector_t;
    }
}

// Projects each mean-centered pixel onto the (normalized) eigenvector,
// producing one scalar coordinate per pixel along the principal axis.
// The alpha term is included only for 4-channel encodes.
void cmp_GetProjecedImage(CMP_INOUT CGV_FLOAT projection_out[16],
                          CMP_IN CGV_Vec4f    image_centered[16],
                          CMP_IN CGU_INT      numEntries,
                          CMP_IN CGV_Vec4f    EigenVector,
                          CMP_IN CGU_UINT8    channels3or4)
{
    // EigenVector must be normalized
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        // Accumulate channel terms in x,y,z,w order (matches original
        // floating-point summation order exactly).
        CGV_FLOAT proj = image_centered[k].x * EigenVector.x;
        proj           = proj + (image_centered[k].y * EigenVector.y);
        proj           = proj + (image_centered[k].z * EigenVector.z);
        if (channels3or4 == 4)
            proj = proj + (image_centered[k].w * EigenVector.w);
        projection_out[k] = proj;
    }
}

// Projection-value / original-pixel-index pair; used by cmp_GetProjectedIndex
// to sort pixels by their residual along the principal axis.
typedef struct
{
    CGV_FLOAT image;  // projection residual for this pixel
    CGU_UINT8 index;  // original pixel position (0..15)
} CMP_di2;

// Quantizes each pixel's axis projection into a ramp index 0..clusters-1,
// then (when rounding residuals are large enough) refines the assignment by
// sorting residuals and bumping the pixels past the best split point up by
// one index — a least-squares style cluster fit on the 1-D projection.
// Finally rebases indices so the minimum is 0, clamped to 0..15.
// NOTE(review): the min/max comparisons below look inverted (image_min ends
// up tracking the largest projection and image_max the smallest). The
// resulting negative image_s still maps projections onto 0..clusters-1, just
// in reverse ramp order — confirm this matches endpoint ordering downstream.
void cmp_GetProjectedIndex(CMP_INOUT CGU_UINT8 projected_index_out[16],  //output: index, uncentered, in the range 0..clusters-1
                           CMP_IN CGV_FLOAT    image_projected[16],      // image_block points, might be uncentered
                           CMP_IN CGU_INT      clusters,                 // clusters: number of points in the ramp   (max 16)
                           CMP_IN CGU_INT      numEntries)
{
    CMP_di2   what[16];
    CGV_FLOAT image_v[16];
    CGV_FLOAT image_z[16];
    CGV_FLOAT image_l;
    CGV_FLOAT image_mm;
    CGV_FLOAT image_r  = 0.0F;
    CGV_FLOAT image_dm = 0.0F;
    CGV_FLOAT image_min;
    CGV_FLOAT image_max;
    CGV_FLOAT image_s;

    CGU_INT i;
    CGU_INT j;

    for (i = 0; i < 16; i++)
        projected_index_out[i] = 0;

    image_min = image_projected[0];
    image_max = image_projected[0];

    for (i = 1; i < numEntries; i++)
    {
        if (image_min < image_projected[i])
            image_min = image_projected[i];
        if (image_max > image_projected[i])
            image_max = image_projected[i];
    }

    CGV_FLOAT img_diff = image_max - image_min;

    // Flat or degenerate projection: all indices stay 0.
    if (img_diff == 0.0f)
        return;
    if (cmp_isnan(img_diff))
        return;

    image_s = (clusters - 1) / img_diff;

    // Initial nearest-index assignment; 'what' keeps each pixel's rounding
    // residual together with its original position for the refinement pass.
    for (i = 0; i < numEntries; i++)
    {
        image_v[i]             = image_projected[i] * image_s;
        image_z[i]             = cmp_floor(image_v[i] + 0.5F - image_min * image_s);
        projected_index_out[i] = (CGU_UINT8)image_z[i];

        what[i].image = image_v[i] - image_z[i] - image_min * image_s;
        what[i].index = i;
        image_dm += what[i].image;
        image_r += what[i].image * what[i].image;
    }

    // Refine only if the residual variance is significant.
    if (numEntries * image_r - image_dm * image_dm >= (CGV_FLOAT)(numEntries - 1) / 8)
    {
        image_dm /= numEntries;

        // Center the residuals.
        for (i = 0; i < numEntries; i++)
            what[i].image -= image_dm;

        // Insertion sort of (residual, pixel) pairs by residual, ascending.
        CGU_UINT8 tmp_index;
        CGV_FLOAT tmp_image;
        for (i = 1; i < numEntries; i++)
        {
            for (j = i; j > 0; j--)
            {
                if (what[j - 1].image > what[j].image)
                {
                    tmp_index         = what[j].index;
                    tmp_image         = what[j].image;
                    what[j].index     = what[j - 1].index;
                    what[j].image     = what[j - 1].image;
                    what[j - 1].index = tmp_index;
                    what[j - 1].image = tmp_image;
                }
            }
        }

        // got into fundamental simplex
        // move coordinate system origin to its center

        // i=0 < numEntries avoids varying int division by 0
        for (i = 0; i < numEntries; i++)
        {
            what[i].image = what[i].image - (CGV_FLOAT)(((2.0f * i + 1) - numEntries) / (2.0f * numEntries));
        }

        // Find the prefix-sum minimum: pixels after position j get bumped.
        image_mm = 0.0F;
        image_l  = 0.0F;

        j = -1;
        for (i = 0; i < numEntries; i++)
        {
            image_l += what[i].image;
            if (image_l < image_mm)
            {
                image_mm = image_l;
                j        = i;
            }
        }

        j = j + 1;
        // avoid  j = j%numEntries use this
        while (j > numEntries)
            j = j - numEntries;

        // Bump the index of every pixel from the split point onward.
        for (i = j; i < numEntries; i++)
        {
            CGU_UINT8 idx            = what[i].index;
            CGU_UINT8 pidx           = projected_index_out[idx] + 1;  //gather_index(projected_index_out,idx)+1;
            projected_index_out[idx] = pidx;                          // scatter_index(projected_index_out,idx,pidx);
        }
    }

    // get minimum index
    CGU_UINT8 index_min = projected_index_out[0];
    for (i = 1; i < numEntries; i++)
    {
        if (projected_index_out[i] < index_min)
            index_min = projected_index_out[i];
    }

    // reposition all index by min index (using min index as 0)
    for (i = 0; i < numEntries; i++)
    {
        projected_index_out[i] = cmp_clampi(projected_index_out[i] - index_min, 0, 15);
    }
}

// Returns the sum of squared per-channel differences between the integer
// source pixels and their float reconstructions over the first numEntries
// entries; the alpha channel contributes only for 4-channel encodes.
CGV_FLOAT cmp_err_Total(CMP_IN CGV_Vec4ui image_src1[16], CMP_IN CGV_Vec4f image_src2[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT total = 0.0F;
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        total += cmp_squaref(image_src1[k].x - image_src2[k].x);
        total += cmp_squaref(image_src1[k].y - image_src2[k].y);
        total += cmp_squaref(image_src1[k].z - image_src2[k].z);
        if (channels3or4 == 4)
            total += cmp_squaref(image_src1[k].w - image_src2[k].w);
    }
    return total;
}

//------------------------------------------------------------------------------
// cmp_GetQuantizeIndex_old (legacy path)
//
// Quantizes a 4x4 source block into numClusters index values (index_out):
//   1. centers the image about its mean and builds a covariance vector,
//   2. projects the pixels onto the dominant eigen vector (PCA axis),
//   3. assigns each pixel a projected index,
//   4. performs one refinement pass that re-derives the axis from the indices
//      and re-projects.
// Returns the total squared error between the source block and the block
// reconstructed from the refined indices (0.0 for near-solid blocks).
//------------------------------------------------------------------------------
CGV_FLOAT cmp_GetQuantizeIndex_old(CMP_INOUT CGU_UINT8 index_out[16],
                                   CMP_IN CGV_Vec4ui   image_src[16],
                                   CMP_IN CGU_INT      numEntries,
                                   CMP_IN CGU_INT      numClusters,
                                   CMP_IN CGU_UINT8    channels3or4)
{
    CGV_FLOAT covariance_vector[16];  // 4x4 covariance matrix stored flat
    CGV_Vec4f image_centered[16];     // source pixels minus the block mean
    CGV_FLOAT image_projected[16];    // scalar projection of each pixel onto the axis
    CGV_Vec4f image_mean   = 0.0f;
    CGV_Vec4f eigen_vector = 0.0f;

    // Init vars
    for (CGU_INT ik = 0; ik < 16; ik++)
    {
        covariance_vector[ik] = 0.0f;
        image_centered[ik]    = 0.0f;
        image_projected[ik]   = 0.0f;
    }

    cmp_GetImageCentered(image_centered, image_mean, image_src, numEntries, channels3or4);
    cmp_GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4);

    //-----------------------------------------------------
    // check if all covariances are the same
    // if so then set all index to same value 0 and return
    // use EPSILON to set the limit for all same limit
    //-----------------------------------------------------

    CGV_FLOAT image_covt = 0.0F;

    // Trace of the covariance matrix: diagonal elements 0, 5, 10 (and 15 for RGBA).
    image_covt = covariance_vector[0];
    image_covt = image_covt + covariance_vector[5];
    image_covt = image_covt + covariance_vector[10];
    if (channels3or4 == 4)
        image_covt = image_covt + covariance_vector[15];

    // 0.00390625f == 1/256: block is effectively a solid color.
    if (image_covt < 0.00390625f)
    {
        for (CGU_INT i = 0; i < 16; i++)
            index_out[i] = 0;
        return 0.0f;
    }

    cmp_GetEigenVector(eigen_vector, covariance_vector, channels3or4);

    // First-pass projection and index assignment along the eigen axis.
    cmp_GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
    cmp_GetProjectedIndex(index_out, image_projected, numClusters, numEntries);

    //==========================================
    // Refine
    //==========================================
    // Re-derive the axis as the index-weighted sum of centered pixels.
    CGV_FLOAT image_q = 0.0F;
    eigen_vector      = 0.0f;

    for (CGU_INT k = 0; k < numEntries; k++)
    {
        eigen_vector.x = eigen_vector.x + image_centered[k].x * index_out[k];
        eigen_vector.y = eigen_vector.y + image_centered[k].y * index_out[k];
        eigen_vector.z = eigen_vector.z + image_centered[k].z * index_out[k];
        if (channels3or4 == 4)
            eigen_vector.w = eigen_vector.w + image_centered[k].w * index_out[k];
    }

    // image_q = |eigen_vector| (Euclidean length) used for normalization.
    image_q = image_q + eigen_vector.x * eigen_vector.x;
    image_q = image_q + eigen_vector.y * eigen_vector.y;
    image_q = image_q + eigen_vector.z * eigen_vector.z;
    if (channels3or4 == 4)
        image_q = image_q + eigen_vector.w * eigen_vector.w;

    image_q = sqrt(image_q);

    // direction needs to be normalized
    if (image_q != 0.0F)
        eigen_vector = eigen_vector / image_q;

    // Get new projected data
    cmp_GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
    cmp_GetProjectedIndex(index_out, image_projected, numClusters, numEntries);

    // Calc Error
    // image_t becomes the variance of the indices; its reciprocal scales the
    // least-squares fit of the reconstruction below.
    CGV_FLOAT image_t       = 0.0F;
    CGV_FLOAT index_average = 0.0F;

    for (CGU_INT ik = 0; ik < numEntries; ik++)
    {
        index_average = index_average + index_out[ik];
        image_t       = image_t + index_out[ik] * index_out[ik];
    }

    index_average = index_average / (CGV_FLOAT)numEntries;
    image_t       = image_t - index_average * index_average * (CGV_FLOAT)numEntries;

    if (image_t != 0.0F)
        image_t = 1.0F / image_t;

    // Rebuild the (unnormalized) axis from the refined indices.
    eigen_vector = 0.0f;

    for (CGU_INT nk = 0; nk < numEntries; nk++)
    {
        eigen_vector.x = eigen_vector.x + image_centered[nk].x * index_out[nk];
        eigen_vector.y = eigen_vector.y + image_centered[nk].y * index_out[nk];
        eigen_vector.z = eigen_vector.z + image_centered[nk].z * index_out[nk];
        if (channels3or4 == 4)
            eigen_vector.w = eigen_vector.w + image_centered[nk].w * index_out[nk];
    }

    // Reconstruct each pixel from its index: mean + axis * (index - avg)/var.
    CGV_Vec4f image_decomp[SOURCE_BLOCK_SIZE];
    for (CGU_UINT32 ii = 0; ii < SOURCE_BLOCK_SIZE; ii++)
        image_decomp[ii] = 0.0f;

    for (CGU_INT i = 0; i < numEntries; i++)
    {
        image_decomp[i].x = image_mean.x + eigen_vector.x * image_t * (index_out[i] - index_average);
        image_decomp[i].y = image_mean.y + eigen_vector.y * image_t * (index_out[i] - index_average);
        image_decomp[i].z = image_mean.z + eigen_vector.z * image_t * (index_out[i] - index_average);
        if (channels3or4 == 4)
            image_decomp[i].w = image_mean.w + eigen_vector.w * image_t * (index_out[i] - index_average);
    }

    CGV_FLOAT err_1 = cmp_err_Total(image_src, image_decomp, numEntries, channels3or4);

    return err_1;
}

// Pairs a projected scalar value with its original partition index so that
// partitions can be sorted by projection while remembering their source slot.
typedef struct
{
    CGV_FLOAT image;  // projection value used as the sort key
    CGU_UINT8 index;  // original partition index prior to sorting
} CMP_du2;

//------------------------------------------------------------------------------
// cmp_sortPartitionProjection
//
// Sorts up to 64 partition projection values in ascending order (insertion
// sort) and returns only the resulting partition index order in 'order'.
// The input projection array is not modified.
//------------------------------------------------------------------------------
void cmp_sortPartitionProjection(CMP_IN CGV_FLOAT projection[64], CMP_INOUT CGU_UINT8 order[64],
                                 CMP_IN CGU_UINT8 numPartitions)  // max 64
{
    CMP_du2   what[64];
    CGU_UINT8 Parti;
    CGU_UINT8 Partj;

    // Pair each projection value with its partition index so the index
    // survives the sort.
    for (Parti = 0; Parti < numPartitions; Parti++)
    {
        what[Parti].index = Parti;
        what[Parti].image = projection[Parti];
    }

    CGU_UINT8 index;
    CGU_FLOAT data;

    // Insertion sort: sink each new element into the sorted prefix.
    for (Parti = 1; Parti < numPartitions; Parti++)
    {
        for (Partj = Parti; Partj > 0; Partj--)
        {
            if (what[Partj - 1].image > what[Partj].image)
            {
                index                 = what[Partj].index;
                data                  = what[Partj].image;
                what[Partj].index     = what[Partj - 1].index;
                what[Partj].image     = what[Partj - 1].image;
                what[Partj - 1].index = index;
                what[Partj - 1].image = data;
            }
        }
    }

    // Emit the sorted partition order.
    for (Parti = 0; Parti < numPartitions; Parti++)
        order[Parti] = what[Parti].index;
}  // note: stray ';' after the function body removed (empty file-scope declaration)

//------------------------------------------------------------------------------
// cmp_get_ideal_cluster
//
// Solves the 2x2 least-squares (RMS fit) problem for the ideal, unquantized
// endpoint pair image_cluster[0..1] given per-pixel cluster indices in
// index_cluster and the ramp maximum Mi_.  Returns FALSE when the normal
// matrix is degenerate (only one active index); both endpoints are then
// zeroed and the caller must handle that solid-color case separately.
//------------------------------------------------------------------------------
CGU_BOOL cmp_get_ideal_cluster(CMP_INOUT CGV_Vec4f image_cluster[2],
                               CMP_IN CGU_UINT32   index_cluster[16],
                               CMP_IN CGU_INT      Mi_,
                               CMP_IN CGV_Vec4ui   image_src[16],
                               CMP_IN CGU_INT      numEntries,
                               CMP_IN CGU_UINT8    channels3or4)
{
    // Per-cluster mean colors.  This array is indexed below by
    // index_cluster[k] (which can reach Mi_ and may exceed numEntries), so
    // clear the whole array; the original code only cleared the first
    // numEntries entries, leaving a potential uninitialized read.
    CGV_Vec4f image_cluster_mean[16];

    for (CGU_INT ii = 0; ii < 16; ii++)
    {
        image_cluster_mean[ii] = 0.0f;
    }

    GetClusterMean2(image_cluster_mean, image_src, index_cluster, numEntries, channels3or4);  // unrounded

    // Symmetric 2x2 normal-equation matrix (later replaced by its inverse),
    // stored as two rows; image_rp is the right-hand side of the fit.
    CGV_FLOAT image_matrix0[2] = {0, 0};  // matrix /inverse matrix
    CGV_FLOAT image_matrix1[2] = {0, 0};  // matrix /inverse matrix
    CGV_Vec4f image_rp[2];                // right part for RMS fit problem

    image_rp[0] = 0.0f;
    image_rp[1] = 0.0f;

    // weight with cnt if runnning on compacted index
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        image_matrix0[0] += (Mi_ - index_cluster[k]) * (Mi_ - index_cluster[k]);
        image_matrix0[1] += index_cluster[k] * (Mi_ - index_cluster[k]);  // im is symmetric
        image_matrix1[1] += index_cluster[k] * index_cluster[k];

        image_rp[0] += image_cluster_mean[index_cluster[k]] * (Mi_ - index_cluster[k]);
        image_rp[1] += image_cluster_mean[index_cluster[k]] * index_cluster[k];
    }

    // Determinant of the symmetric 2x2 system.
    CGV_FLOAT matrix_dd = image_matrix0[0] * image_matrix1[1] - image_matrix0[1] * image_matrix0[1];

    // matrix_dd == 0 means index_cidx[k] and (Mi_ - index_cidx[k]) are
    // collinear, which implies only one active index; that degenerate case
    // is taken care of separately by the caller.
    if (matrix_dd == 0)
    {
        image_cluster[0] = 0.0f;
        image_cluster[1] = 0.0f;
        return FALSE;
    }

    // Invert the 2x2 matrix in place.
    image_matrix1[0] = image_matrix0[0];
    image_matrix0[0] = image_matrix1[1] / matrix_dd;
    image_matrix1[1] = image_matrix1[0] / matrix_dd;

    image_matrix1[0] = image_matrix0[1] = -image_matrix0[1] / matrix_dd;

    CGV_FLOAT Mif = (CGV_FLOAT)Mi_;

    // values can exceed 255 here, clamp made no diff in quality!
    image_cluster[0] = (((image_rp[0] * image_matrix0[0]) + (image_rp[1] * image_matrix0[1])) * Mif);
    image_cluster[1] = (((image_rp[0] * image_matrix1[0]) + (image_rp[1] * image_matrix1[1])) * Mif);

    return TRUE;
}

//------------------------------------------------------------------------------
// cmp_quant_solid_color
//
// Encodes a solid-color (single-index) 4x4 block for the given BC7 block
// mode.  Searches per channel over endpoint parity combinations and ramp
// index candidates for the endpoint pair that best reproduces the single
// source color; writes the chosen endpoints to epo_code_out and a uniform
// index to index_out, and returns the total squared reconstruction error.
// On CPU builds with USE_NEW_SP_ERR_IDX, precomputed single-point error
// tables (old_get_sperr / BC7EncodeRamps2) drive the search; on GPU builds a
// simpler min/max-color fallback selects the endpoints.
//------------------------------------------------------------------------------
CGV_FLOAT cmp_quant_solid_color(CMP_INOUT CGU_UINT32 index_out[16],
                                CMP_INOUT CGV_Vec4ui epo_code_out[2],
                                CMP_IN CGV_Vec4ui    image_src[16],
                                CMP_IN CGU_INT       numEntries,
                                CMP_IN CGU_UINT8     Mi_,
                                CMP_IN CGU_UINT8     bits[4],
                                CMP_IN CGU_INT       type,
                                CMP_IN CGU_UINT8     channels3or4,
                                CMP_IN CGU_INT       blockMode)
{
#ifndef ASPM_GPU
#if defined(USE_NEW_SP_ERR_IDX)
    // clogBC7 = floor(log2(Mi_ + 1)): the ramp index-bit count used as a key
    // into the precomputed single-point error tables.
    CGU_INT clogBC7 = 0;
    CGU_INT iv      = Mi_ + 1;
    while (iv >>= 1)
        clogBC7++;

    old_init_BC7ramps();  // first time call inits global
#endif
#endif

    CGU_INT    index_bits = g_modesettings[blockMode].indexBits;
    CGV_Vec4ui epo_0[2];  // best endpoint pair for the current parity vector

    epo_0[0] = 0u;
    epo_0[1] = 0u;

    CGU_UINT8 image_log = 0;
    CGU_UINT8 image_idx = 0;
    CGU_BOOL  use_par   = FALSE;
    if (type != 0)
        use_par = TRUE;
    CGV_FLOAT error_1 = CMP_FLOAT_MAX;  // best error across parity vectors
    //CGU_UINT8 ch;
    CGU_UINT8 ch1;
    //CGU_INT   k;
    CGU_INT i;

    // Loop over the mode's endpoint parity vectors; stop early on exact match.
    for (CGU_INT pn = 0; pn < cmp_npv_nd[channels3or4 - 3][type] && (error_1 != 0.0F); pn++)
    {
        // o1/o2 hold per-channel [start, end) ranges for the t1/t2 sub-search
        // below; with parity in use the range collapses to the single value
        // dictated by cmp_par_vectors_nd for this parity vector.
        CGU_Vec4ui o1[2] = {{0u, 0u, 0u, 0u}, {2u, 2u, 2u, 2u}};
        CGU_Vec4ui o2[2] = {{0u, 0u, 0u, 0u}, {2u, 2u, 2u, 2u}};

        if (use_par == TRUE)
        {
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][0])
                o1[0][0] = 1;
            else
                o1[1][0] = 1;
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][0])
                o2[0][0] = 1;
            else
                o2[1][0] = 1;

            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][1])
                o1[0][1] = 1;
            else
                o1[1][1] = 1;
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][1])
                o2[0][1] = 1;
            else
                o2[1][1] = 1;

            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][2])
                o1[0][2] = 1;
            else
                o1[1][2] = 1;
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][2])
                o2[0][2] = 1;
            else
                o2[1][2] = 1;

            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][3])
                o1[0][3] = 1;
            else
                o1[1][3] = 1;
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][3])
                o2[0][3] = 1;
            else
                o2[1][3] = 1;
        }
        CGU_INT   image_tcr[MAX_CHANNELS];  // rounded candidate color per channel
        CGU_INT   epo_dr_0[MAX_CHANNELS];   // best candidate color per channel
        CGV_FLOAT error_0 = CMP_FLOAT_MAX;  // best error across ramp indices

        // Try every ramp index; stop early on exact match.
        for (CGU_UINT8 iclogBC7 = 0; iclogBC7 < (1 << index_bits) && (error_0 != 0); iclogBC7++)
        {
            CGV_FLOAT error_t = 0;
            CGU_INT   t1o[MAX_CHANNELS], t2o[MAX_CHANNELS];  // best t1/t2 per channel

            for (ch1 = 0; ch1 < channels3or4; ch1++)
            {
                // D
                CGV_FLOAT error_ta = CMP_FLOAT_MAX;

                for (CGU_UINT8 t1 = o1[0][ch1]; t1 < o1[1][ch1]; t1++)
                {
                    // C
                    // This is needed for non-integer mean points of "collapsed" sets
                    for (CGU_UINT8 t2 = o2[0][ch1]; t2 < o2[1][ch1]; t2++)
                    {
                        // B
                        // Floor/ceil of the source channel value; identical for
                        // integer sources but kept for legacy float inputs.
                        CGU_INT image_tf;
                        CGU_INT image_tc;
                        image_tf = (CGU_INT)cmp_floor(image_src[0][ch1]);
                        image_tc = (CGU_INT)ceil(image_src[0][ch1]);
#ifndef ASPM_GPU
#ifdef USE_NEW_SP_ERR_IDX
                        // Pick whichever of floor/ceil has lower table error.
                        CGV_FLOAT err_tf = old_get_sperr(clogBC7, bits[ch1], image_tf, t1, t2, iclogBC7);
                        CGV_FLOAT err_tc = old_get_sperr(clogBC7, bits[ch1], image_tc, t1, t2, iclogBC7);
                        if (err_tf > err_tc)
                            image_tcr[ch1] = image_tc;
                        else if (err_tf < err_tc)
                            image_tcr[ch1] = image_tf;
                        else
                            // NOTE(review): indexing here is image_src[ch1][COMP_RED],
                            // whereas the rest of this loop uses image_src[0][ch1] —
                            // looks transposed; confirm against the original CPU codec.
                            image_tcr[ch1] = (CGV_INT)cmp_floor(image_src[ch1][COMP_RED] + 0.5F);

                        //===============================
                        // Refine this for better quality!
                        //===============================
                        CGV_FLOAT error_tr;
                        error_tr = old_get_sperr(clogBC7, bits[ch1], image_tcr[ch1], t1, t2, iclogBC7);
                        error_tr = (error_tr * error_tr) + 2 * error_tr * old_img_absf(image_tcr[ch1] - image_src[ch1][COMP_RED]) +
                                   (image_tcr[ch1] - image_src[ch1][COMP_RED]) * (image_tcr[ch1] - image_src[ch1][COMP_RED]);

                        if (error_tr < error_ta)
                        {
                            error_ta      = error_tr;
                            t1o[ch1]      = t1;
                            t2o[ch1]      = t2;
                            epo_dr_0[ch1] = cmp_clampi(image_tcr[ch1], 0, 255);
                        }
#endif
#else
                        // GPU path: no error tables — round to nearest and accept.
                        image_tcr[ch1] = (CGU_INT)cmp_floor(image_src[0][ch1] + 0.5F);
                        error_ta       = 0;
                        t1o[ch1]       = t1;
                        t2o[ch1]       = t2;
                        epo_dr_0[ch1]  = cmp_clampi(image_tcr[ch1], 0, 255);
#endif

                    }  // B
                }      //C

                error_t += error_ta;
            }  // D

            if (error_t <= error_0)
            {
                // We have a solid color: Use image src if on GPU
                image_log = iclogBC7;
                image_idx = image_log;

#ifndef ASPM_GPU
#ifdef USE_BC7_SP_ERR_IDX
                // NOTE(review): this block is guarded by USE_BC7_SP_ERR_IDX while
                // clogBC7 above is only computed under USE_NEW_SP_ERR_IDX —
                // confirm both macros are defined together in CPU builds.
                if (BC7EncodeRamps2.ramp_init)
                {
                    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
                    {
                        // Flat index into the precomputed sp_idx endpoint table:
                        // [clog][bits][color][t1][t2][rampIndex][lo/hi].
                        CGV_INT index = (CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits[ch]) * 256 * 2 * 2 * 16 * 2) + (epo_dr_0[ch] * 2 * 2 * 16 * 2) +
                                        (t1o[ch] * 2 * 16 * 2) + (t2o[ch] * 16 * 2) + (iclogBC7 * 2);
                        epo_0[0][ch] = BC7EncodeRamps2.sp_idx[index + 0] & 0xFF;
                        epo_0[1][ch] = BC7EncodeRamps2.sp_idx[index + 1] & 0xFF;
                    }
                }
#endif
#else
                CGU_UINT8 ch;
                CGU_UINT8 k;
                // This needs improving
                CGV_FLOAT MinC[4] = {255, 255, 255, 255};
                CGV_FLOAT MaxC[4] = {0, 0, 0, 0};
                // get min max colors
                for (ch = 0; ch < channels3or4; ch++)
                    for (k = 0; k < numEntries; k++)
                    {
                        if (image_src[k][ch] < MinC[ch])
                            MinC[ch] = image_src[k][ch];
                        if (image_src[k][ch] > MaxC[ch])
                            MaxC[ch] = image_src[k][ch];
                    }
                epo_0[0][0] = (CGU_UINT8)MinC[0];
                epo_0[1][0] = (CGU_UINT8)MaxC[0];
                epo_0[0][1] = (CGU_UINT8)MinC[1];
                epo_0[1][1] = (CGU_UINT8)MaxC[1];
                epo_0[0][2] = (CGU_UINT8)MinC[2];
                epo_0[1][2] = (CGU_UINT8)MaxC[2];
                epo_0[0][3] = (CGU_UINT8)MinC[3];
                epo_0[1][3] = (CGU_UINT8)MaxC[3];
#endif

                error_0 = error_t;
            }

        }  // E

        // Keep the best parity vector's endpoints and index.
        if (error_0 < error_1)
        {
            image_idx       = image_log;
            epo_code_out[0] = epo_0[0];
            epo_code_out[1] = epo_0[1];
            error_1         = error_0;
        }

    }  //1

    // Get Image error
    // Reconstruct the block from the chosen endpoints/index and measure error.
    CGV_Vec4f image_decomp[16];
    for (i = 0; i < numEntries; i++)
    {
        index_out[i] = image_idx;
        {
            image_decomp[i][0] = cmp_GetRamp(index_bits, bits[0], epo_code_out[0].x, epo_code_out[1].x, i);
            image_decomp[i][1] = cmp_GetRamp(index_bits, bits[1], epo_code_out[0].y, epo_code_out[1].y, i);
            image_decomp[i][2] = cmp_GetRamp(index_bits, bits[2], epo_code_out[0].z, epo_code_out[1].z, i);
            if (channels3or4 == 4)
                image_decomp[i][3] = cmp_GetRamp(index_bits, bits[3], epo_code_out[0].w, epo_code_out[1].w, i);
        }
    }
    // Do we need to do this rather then err_1 * numEntries
    CGV_FLOAT error_quant;
    error_quant = cmp_err_Total(image_src, image_decomp, numEntries, channels3or4);

    return error_quant;
}

// Squares a scalar value (legacy helper kept for the requantization path).
INLINE CGV_FLOAT old_sq_image(CGV_FLOAT v)
{
    CGV_FLOAT squared = v * v;
    return squared;
}

//------------------------------------------------------------------------------
// cmp_shake3
//
// Endpoint "shake" refinement: starting from the ideal (unquantized) cluster
// endpoints in image_cluster, performs a small exhaustive search per channel
// and per endpoint parity combination (ppA/ppB) around the quantized endpoint
// values, recording the best squared error for each combination.  It then
// evaluates the mode's legal parity vectors and writes the lowest-error
// quantized endpoint pair to epo_code_shake.  Returns that best error.
//------------------------------------------------------------------------------
CGV_FLOAT cmp_shake3(CMP_INOUT CGU_Vec4ui epo_code_shake[2],
                     CMP_IN CGV_Vec4f     image_cluster[2],
                     CMP_IN CGU_UINT32    index_cidx[16],
                     CMP_IN CGV_Vec4ui    image_src[16],
                     CMP_IN CGU_INT       index_bits,
                     CMP_IN CGU_INT       type,
                     CMP_IN CGU_UINT8     max_bits[4],
                     CMP_IN CGU_UINT8     use_par,
                     CMP_IN CGU_INT       numEntries,  // max 16
                     CMP_IN CGU_UINT8     channels3or4)
{
    CGV_FLOAT best_err = CMP_FLOAT_MAX;

    // err_ed is indexed as (ppA * 8) + (ppB * 4) + ch: a flat table of best
    // errors per parity combination and channel (ch < 4).
    CGV_FLOAT err_ed[16] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    // Best endpoint values per [ppA][ppB][endpoint lo/hi][channel].
    CGU_INT   epo_code_par[2][2][2][MAX_CHANNELS];

    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        CGU_UINT8 ppA = 0;
        CGU_UINT8 ppB = 0;
        CGU_UINT8 rr  = (use_par ? 2 : 1);  // with parity, try both parity bits
        CGU_INT   epo_code_epi[2][2];  // first/second, coord, begin rage end range

        for (ppA = 0; ppA < rr; ppA++)
        {  // loop max =2
            for (ppB = 0; ppB < rr; ppB++)
            {  //loop  max =2

                // set default ranges
                epo_code_epi[0][0] = epo_code_epi[0][1] = cmp_ep_find_floor2(image_cluster[0][ch], max_bits[ch], use_par, ppA);
                epo_code_epi[1][0] = epo_code_epi[1][1] = cmp_ep_find_floor2(image_cluster[1][ch], max_bits[ch], use_par, ppB);

                // set begin range
                // NOTE(review): the '& (~use_par)' masks suppress range widening
                // when parity is in use (step 2 below covers the spacing);
                // ~use_par only clears bit 0, so a widening value of 2 would
                // survive the mask — presumably intentional; confirm.
                epo_code_epi[0][0] -= ((epo_code_epi[0][0] < 1 ? epo_code_epi[0][0] : 1)) & (~use_par);
                epo_code_epi[1][0] -= ((epo_code_epi[1][0] < 1 ? epo_code_epi[1][0] : 1)) & (~use_par);

                // set end range
                epo_code_epi[0][1] += ((1 << max_bits[ch]) - 1 - epo_code_epi[0][1] < 2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[0][1] : 2) & (~use_par);
                epo_code_epi[1][1] += ((1 << max_bits[ch]) - 1 - epo_code_epi[1][1] < 2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[1][1] : 2) & (~use_par);

                // Step 2 when parity fixes the low bit, else 1.
                CGU_INT step                       = (1 << use_par);
                err_ed[(ppA * 8) + (ppB * 4) + ch] = CMP_FLOAT_MAX;

                // Exhaustive search of the small endpoint neighborhood.
                for (CGU_INT epo_p1 = epo_code_epi[0][0]; epo_p1 <= epo_code_epi[0][1]; epo_p1 += step)
                {
                    for (CGU_INT epo_p2 = epo_code_epi[1][0]; epo_p2 <= epo_code_epi[1][1]; epo_p2 += step)
                    {
                        CGV_FLOAT image_square_diff = 0.0F;
                        CGU_INT   _mc               = numEntries;
                        CGV_FLOAT image_ramp;

                        // Accumulate squared error of every pixel against the
                        // ramp value selected by its index.
                        while (_mc > 0)
                        {
                            image_ramp = cmp_GetRamp(index_bits, max_bits[ch], epo_p1, epo_p2, index_cidx[_mc - 1]);

                            image_square_diff += cmp_squaref(image_ramp - image_src[(_mc - 1)][ch]);
                            _mc--;
                        }
                        if (image_square_diff < err_ed[(ppA * 8) + (ppB * 4) + ch])
                        {
                            err_ed[(ppA * 8) + (ppB * 4) + ch] = image_square_diff;
                            epo_code_par[ppA][ppB][0][ch]      = epo_p1;
                            epo_code_par[ppA][ppB][1][ch]      = epo_p2;
                        }
                    }
                }
            }  // pp1
        }      // pp0
    }          // j

    //---------------------------------------------------------
    // Combine the per-channel results under each legal parity vector and keep
    // the combination with the lowest total error.
    for (CGU_INT pn = 0; pn < cmp_npv_nd[channels3or4 - 3][type]; pn++)
    {
        CGV_FLOAT err_2 = 0.0F;
        CGU_INT   d1;
        CGU_INT   d2;

        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        {
            d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][ch];
            d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][ch];
            err_2 += err_ed[(d1 * 8) + (d2 * 4) + ch];
        }

        if (err_2 < best_err)
        {
            best_err             = err_2;
            d1                   = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][0];
            d2                   = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][0];
            epo_code_shake[0][0] = epo_code_par[d1][d2][0][0];
            epo_code_shake[1][0] = epo_code_par[d1][d2][1][0];
            d1                   = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][1];
            d2                   = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][1];
            epo_code_shake[0][1] = epo_code_par[d1][d2][0][1];
            epo_code_shake[1][1] = epo_code_par[d1][d2][1][1];
            d1                   = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][2];
            d2                   = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][2];
            epo_code_shake[0][2] = epo_code_par[d1][d2][0][2];
            epo_code_shake[1][2] = epo_code_par[d1][d2][1][2];
            d1                   = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][3];
            d2                   = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][3];
            epo_code_shake[0][3] = epo_code_par[d1][d2][0][3];
            epo_code_shake[1][3] = epo_code_par[d1][d2][1][3];
        }
    }

    return best_err;
}

//------------------------------------------------------------------------------
// cmp_requantized_index
//
// Given a candidate endpoint pair (epo_code_best), rebuilds the full ramp of
// (1 << index_bits) reconstruction levels per channel, then assigns each of
// the numEntries source pixels the ramp entry with the minimum squared error.
// Writes the chosen per-pixel indices to index_out and returns the total
// accumulated squared error.
// (Two superseded commented-out implementations were removed, along with the
// unused 'imageDiff' local they referenced.)
//------------------------------------------------------------------------------
CGV_FLOAT cmp_requantized_index(CMP_INOUT CGU_UINT8  index_out[16],
                                CMP_INOUT CGU_Vec4ui epo_code_best[2],
                                CMP_IN CGU_INT       index_bits,
                                CMP_IN CGU_UINT8     max_bits[4],
                                CMP_IN CGV_Vec4ui    image_src[16],
                                CMP_IN CGU_INT       numEntries,
                                CMP_IN CGU_UINT8     channels3or4)
{
    CGU_UINT8 k;
    CGU_UINT8 ch;

    //=========================================
    // requantized image based on new epo_code
    //=========================================
    CGV_FLOAT image_requantize[SOURCE_BLOCK_SIZE][MAX_CHANNELS];
    CGV_FLOAT err_r = 0.0F;

    for (ch = 0; ch < channels3or4; ch++)
    {
        for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
        {
            image_requantize[k][ch] = cmp_GetRamp(index_bits, max_bits[ch], epo_code_best[0][ch], epo_code_best[1][ch], k);
        }
    }

    //==================================================================
    // For each pixel, pick the ramp entry with minimum squared error
    //==================================================================
    for (k = 0; k < numEntries; k++)
    {
        CGV_FLOAT err_cmin     = CMP_FLOAT_MAX;
        CGV_INT   hold_index_j = 0;

        for (CGV_INT iclogBC7 = 0; iclogBC7 < (1 << index_bits); iclogBC7++)
        {
            CGV_FLOAT image_err = 0.0F;

            for (ch = 0; ch < channels3or4; ch++)
            {
                image_err += old_sq_image(image_requantize[iclogBC7][ch] - image_src[k][ch]);
            }

            if (image_err < err_cmin)
            {
                err_cmin     = image_err;
                hold_index_j = iclogBC7;
            }
        }

        index_out[k] = (CGV_UINT8)hold_index_j;
        err_r += err_cmin;
    }

    return err_r;
}

//------------------------------------------------------------------------------
// cmp_optimize_IndexAndEndPoints
//
// Jointly optimizes the endpoint pair (epo_code_out) and per-pixel indices
// (index_io, also packed into index_packed_out) for one subset of a BC7
// block:
//   1. collapses the incoming indices to a compact range,
//   2. for each index slope/offset remapping, derives ideal cluster endpoints
//      (cmp_get_ideal_cluster) and refines them (cmp_shake3),
//   3. requantizes against the best endpoints (cmp_requantized_index) and
//      keeps the lowest-error result,
//   4. repeats up to MAX_TRY_SHAKER times or until errorThreshold is met.
// Returns the best achieved squared error (CMP_FLOAT_MAX when the mode's
// index width is unsupported).  Solid-color blocks short-circuit to
// cmp_quant_solid_color.
//------------------------------------------------------------------------------
CGV_FLOAT cmp_optimize_IndexAndEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],
                                         CMP_INOUT CGU_UINT32 index_io[16],
                                         CMP_INOUT CGU_UINT32 index_packed_out[2],
                                         CMP_IN CGV_Vec4ui    image_src[16],
                                         CMP_IN CGU_INT       numEntries,
                                         CMP_IN CGU_UINT8     Mi_,
                                         CMP_IN CGU_UINT8     bits,
                                         CMP_IN CGU_UINT8     channels3or4,
                                         CMP_IN CGU_FLOAT     errorThreshold,
                                         CMP_IN CGU_INT       blockMode)
{
    CGV_FLOAT err_best = CMP_FLOAT_MAX;
    CGU_INT   type;
    CGU_UINT8 channels2 = 2 * channels3or4;
    // type != 0 means the endpoint bit budget does not divide evenly across
    // channels, so parity (shared/per-endpoint P) bits are in play.
    type                = bits % channels2;

    CGU_UINT8 use_par = (type != 0);

    // Per-channel endpoint bit width (total bits split across both endpoints
    // of every channel, rounded up).
    CGU_UINT8 max_bits[4] = {0, 0, 0, 0};
    CGU_UINT8 ch;
    CGU_INT   k;

    for (ch = 0; ch < channels3or4; ch++)
        max_bits[ch] = (bits + channels2 - 1) / channels2;

    CGU_INT index_bits  = g_modesettings[blockMode].indexBits;
    CGU_INT clt_clogBC7 = index_bits - 2;

    // Only index widths of 2..5 bits are supported by the tables.
    if (clt_clogBC7 > 3)
        return CMP_FLOAT_MAX;

    // Convert index count to the maximum index value.
    Mi_ = Mi_ - 1;

    CGU_UINT32 index_tmp[16];
    CGU_INT    maxTry = MAX_TRY_SHAKER;

    for (k = 0; k < numEntries; k++)
        index_tmp[k] = cmp_clampui8(index_io[k], 0, 15);

    epo_code_out[0] = 0u;
    epo_code_out[1] = 0u;

    CGV_FLOAT err_requant = 0.0F;

    CGU_UINT8 MaxIndex;

    // Collapse indices so they occupy 0..MaxIndex with no gaps.
    MaxIndex = index_collapse2(index_tmp, numEntries);

    //===============================
    // we have a solid color 4x4 block
    //===============================
    if (MaxIndex == 0)
    {
        return cmp_quant_solid_color(index_io, epo_code_out, image_src, numEntries, Mi_, max_bits, type, channels3or4, blockMode);
    }

    for (CGU_INT ii = 0; ii < maxTry; ii++)
    {
        //===============================
        // We have ramp colors to process
        //===============================
        CGV_FLOAT  err_cluster = CMP_FLOAT_MAX;
        CGV_FLOAT  err_shake;
        CGU_UINT32 index_cluster[16];
        CGU_Vec4ui epo_code_best[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

        for (CGU_UINT8 ii2 = 0; ii2 < numEntries; ii2++)
            index_cluster[ii2] = 0;
        CGU_UINT8 mi = Mi_;

        // Try every affine remapping (slope * index + offset) of the collapsed
        // indices that still fits in 0..mi.
        for (CGU_UINT8 index_slope = 1; (index_slope * MaxIndex) <= mi; index_slope++)
        {
            CGV_Vec4f image_cluster[2] = {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};

            for (CGU_UINT8 index_offset = 0; index_offset <= (mi - index_slope * MaxIndex); index_offset++)
            {
                //-------------------------------------
                // set a new index data to try
                //-------------------------------------
                for (k = 0; k < numEntries; k++)
                    index_cluster[k] = index_tmp[k] * index_slope + index_offset;

                // Skip degenerate clusters (single active index).
                if (cmp_get_ideal_cluster(image_cluster, index_cluster, Mi_, image_src, numEntries, channels3or4))
                {
                    CGU_Vec4ui epo_code_shake[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

                    err_shake =
                        cmp_shake3(epo_code_shake, image_cluster, index_cluster, image_src, index_bits, type, max_bits, use_par, numEntries, channels3or4);

                    if (err_shake < err_cluster)
                    {
                        err_cluster      = err_shake;
                        epo_code_best[0] = epo_code_shake[0];
                        epo_code_best[1] = epo_code_shake[1];
                    }
                }
            }
        }

        if ((err_cluster != CMP_FLOAT_MAX))
        {
            //=========================
            // test results for quality
            //=========================
            CGU_UINT8 index_best[16] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
            err_requant              = cmp_requantized_index(index_best, epo_code_best, index_bits, max_bits, image_src, numEntries, channels3or4);
            if (err_requant < err_best)
            {
                //better = 1;
                for (k = 0; k < numEntries; k++)
                    index_io[k] = index_tmp[k] = index_best[k];

                cmp_pack4bitindex32(index_packed_out, index_io);
                epo_code_out[0] = epo_code_best[0];
                epo_code_out[1] = epo_code_best[1];
                err_best        = err_requant;
            }
        }

        // Early out if we have our target err
        if (err_best <= errorThreshold)
            break;

        // Re-collapse the (possibly updated) indices and iterate again;
        // a solid-color collapse means no further refinement is possible.
        MaxIndex = index_collapse2(index_tmp, numEntries);
        if (MaxIndex == 0)
            break;
    }

    return err_best;
}

// Writes 'bits' bits of bitVal into the packed byte array 'base' starting at
// bit position 'offset', spilling into the next byte when the value straddles
// a byte boundary.  Returns the advanced bit position (offset + bits).
CGU_UINT8 cmp_Write8Bit2(CMP_INOUT CGU_UINT8 base[16], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT8 bitVal)
{
    CGU_INT byteSlot = offset / 8;
    CGU_INT bitSlot  = offset % 8;

    base[byteSlot] |= bitVal << bitSlot;
    if (bitSlot + bits > 8)
    {
        // value crosses a byte boundary: put the high bits in the next byte
        base[byteSlot + 1] |= shift_right_uint82(bitVal, 8 - bitSlot);
    }
    return offset + bits;
}

// Logical right shift of an 8-bit value; isolated in a helper so the expected
// HLSL variable-shift performance warning is confined to one place.
INLINE CGU_UINT8 shift_right_uint8V2(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 bits)
{
    CGU_UINT8 shifted = v >> bits;  // (perf warning expected)
    return shifted;
}

// Writes 'bits' bits of bitVal into the packed byte array 'base' starting at
// bit position 'offset' (no return value, unlike cmp_Write8Bit2).
void cmp_Write8BitV2(CMP_INOUT CGU_UINT8 base[16], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT8 bitVal)
{
    CGU_INT byteIndex = offset / 8;
    CGU_INT bitIndex  = offset % 8;

    base[byteIndex] |= bitVal << bitIndex;
    if (bitIndex + bits > 8)
    {
        // value crosses a byte boundary: spill the high bits into the next byte
        base[byteIndex + 1] |= shift_right_uint8V2(bitVal, 8 - bitIndex);
    }
}

//----------------------------------------------------------------------------------
// Packs a finished mode 0/1/2/3/7 result into the 16-byte BC7 block layout:
// mode prefix, partition id, per-subset endpoints (components interleaved),
// parity (p) bits where the mode uses them, then the per-texel indices with
// each fixup (anchor) index written one bit short - its MSB is implicitly 0.
//
// blockMode       : BC7 mode being encoded (0, 1, 2, 3 or 7)
// bestPartition   : winning partition shape id (0..63)
// packedEndpoints : per-subset endpoint pairs, components packed LSB first,
//                   parity bit (when the mode has one) in bit 0
// index16         : per-texel palette indices for the chosen partition
// cmp_out         : receives the 16 compressed bytes (cleared here first)
//----------------------------------------------------------------------------------
void cmp_Encode_mode01237(CMP_IN CGU_INT      blockMode,
                          CMP_IN CGU_UINT8    bestPartition,
                          CMP_IN CGU_UINT32   packedEndpoints[6],
                          CMP_IN CGU_UINT8    index16[16],
                          CMP_INOUT CGU_UINT8 cmp_out[16])
{
    CGU_UINT8  blockindex[SOURCE_BLOCK_SIZE];
    CGU_UINT32 indexBitsV = g_modesettings[blockMode].indexBits;
    CGU_UINT32 k;
    CGU_UINT32 ch;

    // Start from a cleared block
    for (k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
        cmp_out[k] = 0;

    // mode 0 = 1, mode 1 = 01, mode 2 = 001, mode 3 = 0001, ...
    CGU_INT bitPosition = blockMode;
    bitPosition         = cmp_Write8Bit2(cmp_out, bitPosition, 1, 1);

    // Write partition bits
    bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].partitionBits, bestPartition);

    // Sort out the index set and tag whether we need to flip the
    // endpoints to get the correct state in the implicit index bits
    // The implicitly encoded MSB of the fixup index must be 0
    CGU_UINT32 fixup[3] = {0, 0, 0};
    cmp_get_fixuptable(fixup, (g_modesettings[blockMode].maxSubSets == 2 ? bestPartition : bestPartition + 64));

    // Extract indices and mark subsets that need to have their colours flipped to get the
    // right state for the implicit MSB of the fixup index
    CGU_INT flipColours[3] = {0, 0, 0};

    for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
        blockindex[k] = index16[k];
        for (CGU_UINT8 j = 0; j < g_modesettings[blockMode].maxSubSets; j++)
        {
            if (k == fixup[j])
            {
                // Anchor index has its MSB set: this subset's endpoints must be swapped
                if (blockindex[k] & (1 << (indexBitsV - 1)))
                {
                    flipColours[j] = 1;
                }
            }
        }
    }

    // Now we must flip the endpoints where necessary so that the implicitly encoded
    // index bits have the correct state
    for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
    {
        if (flipColours[k] == 1)
        {
            CGU_UINT32 temp            = packedEndpoints[k * 2 + 0];
            packedEndpoints[k * 2 + 0] = packedEndpoints[k * 2 + 1];
            packedEndpoints[k * 2 + 1] = temp;
        }
    }

    // ...next flip the indices where necessary

    for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
        CGU_UINT8 partsub = cmp_get_partition_subset2(bestPartition, g_modesettings[blockMode].maxSubSets, k);

        if (flipColours[partsub] == 1)
        {
            // Mirror the index across the palette to compensate for the endpoint swap
            blockindex[k] = ((1 << indexBitsV) - 1) - blockindex[k];
        }
    }

    // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
    // i.e. components are packed together
    CGU_Vec4ui unpackedColours[MAX_SUBSETS * 2];
    CGU_UINT8  parityBits[MAX_SUBSETS][2];

    // Init
    for (k = 0; k < MAX_SUBSETS * 2; k++)
        unpackedColours[k] = 0;

    // Unpack the colour values for the subsets
    for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
    {
        CGU_UINT32 packedColours[2] = {packedEndpoints[k * 2 + 0], packedEndpoints[k * 2 + 1]};

        if (blockMode == 0 || blockMode == 3 || blockMode == 7)
        {  // TWO_PBIT: each endpoint carries its own parity bit in bit 0
            parityBits[k][0] = packedColours[0] & 1;
            parityBits[k][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        }
        else if (blockMode == 1)
        {  // ONE_PBIT: a single parity bit shared by both endpoints
           // NOTE(review): both entries come from packedColours[1] - confirm the
           // shared p-bit is meant to be taken from the second endpoint
            parityBits[k][0] = packedColours[1] & 1;
            parityBits[k][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        }
        else if (blockMode == 2)
        {
            // Mode 2 has no parity bits
            parityBits[k][0] = 0;
            parityBits[k][1] = 0;
        }

        // Split the packed endpoint values back into per-channel components
        for (ch = 0; ch < g_modesettings[blockMode].channels3or4; ch++)
        {
            unpackedColours[k * 2][ch]     = packedColours[0] & ((1 << g_modesettings[blockMode].componentBits) - 1);
            unpackedColours[k * 2 + 1][ch] = packedColours[1] & ((1 << g_modesettings[blockMode].componentBits) - 1);
            packedColours[0] >>= g_modesettings[blockMode].componentBits;
            packedColours[1] >>= g_modesettings[blockMode].componentBits;
        }
    }

    // Loop over component
    for (ch = 0; ch < g_modesettings[blockMode].channels3or4; ch++)
    {
        // loop over subsets
        for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
        {
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].componentBits, unpackedColours[k * 2][ch] & 0xFF);
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].componentBits, unpackedColours[k * 2 + 1][ch] & 0xFF);
        }
    }

    // write parity bits
    if (blockMode != 2)
    {
        for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
        {
            if (blockMode == 1)
            {  // ONE_PBIT
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][0] & 0x01);
            }
            else
            {  // TWO_PBIT
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][0] & 0x01);
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][1] & 0x01);
            }
        }
    }

    // Encode the index bits
    CGU_INT bitPositionV = bitPosition;
    for (k = 0; k < 16; k++)
    {
        CGU_UINT8 partsub = cmp_get_partition_subset2(bestPartition, g_modesettings[blockMode].maxSubSets, k);

        // If this is a fixup index then drop the MSB which is implicitly 0
        if (k == fixup[partsub])
        {
            cmp_Write8BitV2(cmp_out, bitPositionV, g_modesettings[blockMode].indexBits - 1, blockindex[k] & 0x07F);
            bitPositionV += g_modesettings[blockMode].indexBits - 1;
        }
        else
        {
            cmp_Write8BitV2(cmp_out, bitPositionV, g_modesettings[blockMode].indexBits, blockindex[k]);
            bitPositionV += g_modesettings[blockMode].indexBits;
        }
    }
}

//----------------------------------------------------------------------------------
// Compresses one 4x4 block (image_src) with a partitioned BC7 mode and, when it
// beats the incoming best error, writes the packed 128-bit result into
// best_cmp_out[0..3]. Returns the best block error found.
//
// USE_OLDCODE path: quantize all 64 partition shapes, rank them by error,
// endpoint-shake the top candidates, then pack the winner via
// cmp_Encode_mode01237. Results are also mirrored into shared_temp.
// NOTE(review): the #else path looks like debug/bring-up code - it encodes
// with a hard-coded input_mode and always returns 0.
//----------------------------------------------------------------------------------
CGV_FLOAT cmp_process_mode(CMP_INOUT CGU_UINT32 best_cmp_out[5], CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_INT block_mode)
{
#ifdef USE_OLDCODE
    CGV_FLOAT  best_err = 1e30f;
    CGU_Vec4ui epo_code[6];
    CGU_Vec4ui bestEndpoints[6];
    CGU_UINT8  bestindex[3][16];
    CGU_INT    bestEntryCount[3];
    CGU_UINT8  bestindex16[16];
    CGU_UINT32 packedEndpoints[6] = {0, 0, 0, 0, 0, 0};

    CGU_UINT32 k;
    CGU_UINT32 ch;
    CGU_UINT32 subset;

    // Check for a solid color for a fast encode
    CGV_Vec4ui mean_out = 0.0f;  // NOTE(review): float zero assigned to a uint vector - relies on the vector type's converting assignment

    for (k = 0; k < 16; k++)
    {
        mean_out       = mean_out + image_src[k];
        bestindex16[k] = 0;
    }

    mean_out = mean_out / 16;

    // Image has alpha
    // NOTE(review): this branch is empty - the transparent-block fast path
    // appears to be unimplemented
    if (mean_out.w < 255)
    {
    }

    CGU_UINT8 storedBestindex[64][3][16];  // per-partition, per-subset quantized indices
    CGV_FLOAT storedError[64];             // per-partition quantize error
    CGU_UINT8 sortedPartition[64];         // partition ids ordered by error

    CGV_FLOAT  quality    = 1.0f;
    CGV_FLOAT  opaque_err = 0.0f;
    CGV_Vec4ui image_subsets[3][16];
    CGU_INT    subset_entryCount[MAX_SUBSETS] = {0, 0, 0};
    CGU_UINT8  bestPartition                  = 0;

    //=============================================================
    // Pass 1: quantize every partition shape and record its error
    //=============================================================
    for (CGU_UINT8 mode_blockPartition = 0; mode_blockPartition < 64; mode_blockPartition++)
    {
        cmp_GetPartitionSubSet2_mode01237(
            image_subsets, subset_entryCount, mode_blockPartition, image_src, block_mode, g_modesettings[block_mode].channels3or4);

        CGV_Vec4ui subset_image_src[16];
        CGU_UINT8  index_out1[16];
        CGV_FLOAT  err_quant = 0.0F;

        // Store the quantize error for this partition to be sorted and processed later
        for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
        {
            CGU_INT numEntries = subset_entryCount[subset];

            for (CGU_UINT8 ii = 0; ii < 16; ii++)
                subset_image_src[ii] = image_subsets[subset][ii];

            err_quant += cmp_GetQuantizeIndex_old(
                index_out1, subset_image_src, numEntries, g_modesettings[block_mode].clusters, g_modesettings[block_mode].channels3or4);

            for (CGU_UINT8 idx = 0; idx < numEntries; idx++)
                storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx];
        }

        storedError[mode_blockPartition] = err_quant;
    }

    // Sort the results
    cmp_sortPartitionProjection(storedError, sortedPartition, 64);  // 64 partitions

    // Number of top-ranked partitions to refine scales with quality (clamped to 1..64)
    CGU_UINT8 numShakeAttempts = cmp_max8(1, cmp_min8((CGU_UINT8)cmp_floor(8 * quality + 0.5), 64));  // 64 partitions
    CGV_FLOAT err_best         = CMP_FLOAT_MAX;

    // Now do the endpoint shaking
    for (CGU_UINT8 nSA = 0; nSA < numShakeAttempts; nSA++)
    {
        CGV_FLOAT err_optimized = 0.0F;
        CGU_UINT8 sortedBlockPartition;
        sortedBlockPartition = sortedPartition[nSA];

        //********************************************
        // Get the partition shape for the given mode
        //********************************************
        cmp_GetPartitionSubSet2_mode01237(
            image_subsets, subset_entryCount, sortedBlockPartition, image_src, block_mode, g_modesettings[block_mode].channels3or4);

        //*****************************
        // Process the partition shape
        //*****************************
        for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
        {
            CGU_INT    numEntries = subset_entryCount[subset];
            CGU_UINT32 index_io[16];
            CGV_Vec4ui src_image_block[16];
            CGU_Vec4ui tmp_epo_code[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

            for (k = 0; k < 16; k++)
                src_image_block[k] = image_subsets[subset][k];

            for (k = 0; k < 16; k++)
                index_io[k] = storedBestindex[sortedBlockPartition][subset][k];

            CGU_UINT32 index_packed_out[2] = {0, 0};

            err_optimized += cmp_optimize_IndexAndEndPoints(tmp_epo_code,
                                                            index_io,
                                                            index_packed_out,
                                                            src_image_block,
                                                            numEntries,
                                                            g_modesettings[block_mode].clusters,
                                                            g_modesettings[block_mode].bits,
                                                            g_modesettings[block_mode].channels3or4,
                                                            0.01f,
                                                            1);

            for (k = 0; k < 16; k++)
                storedBestindex[sortedBlockPartition][subset][k] = index_io[k];

            epo_code[subset * 2]     = tmp_epo_code[0];
            epo_code[subset * 2 + 1] = tmp_epo_code[1];

            // Mirror results into shared_temp (presumably for debug/HLSL parity; see printSharedTemp)
            shared_temp[subset * 2].endPoint_low      = tmp_epo_code[0];
            shared_temp[subset * 2 + 1].endPoint_high = tmp_epo_code[1];
        }

        //****************************************
        // Check if result is better than the last
        //****************************************
        if (err_optimized < err_best)
        {
            bestPartition          = sortedBlockPartition;
            CGU_INT bestIndexCount = 0;

            for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
            {
                CGU_UINT32 numEntries  = subset_entryCount[subset];
                bestEntryCount[subset] = numEntries;

                if (numEntries)
                {
                    bestEndpoints[subset * 2]     = epo_code[subset * 2];
                    bestEndpoints[subset * 2 + 1] = epo_code[subset * 2 + 1];

                    shared_temp[subset * 2].endPoint_low      = bestEndpoints[subset * 2];
                    shared_temp[subset * 2 + 1].endPoint_high = bestEndpoints[subset * 2 + 1];

                    for (k = 0; k < numEntries; k++)
                    {
                        bestindex[subset][k]          = storedBestindex[sortedBlockPartition][subset][k];
                        bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k];
                        shared_temp[k].colorindex     = storedBestindex[sortedBlockPartition][subset][k];
                    }
                }
            }

            err_best = err_optimized;
            // Early out if we found we can compress with error below the quality threshold
            if (err_best <= 0.01f)  // threshold err
            {
                break;
            }
        }
    }

    if (block_mode != 7)
        err_best += opaque_err;

    // Keep the caller's result if we could not beat it
    if (err_best > best_err)
        return best_err;

    //**************************
    // Save the encoded block
    //**************************
    best_err = err_best;

    // Now we have all the data needed to encode the block
    // We need to pack the endpoints prior to encoding

    for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
    {
        packedEndpoints[subset * 2]     = 0;
        packedEndpoints[subset * 2 + 1] = 0;

        if (bestEntryCount[subset])
        {
            CGU_UINT32 rightAlignment = 0;

            // Sort out parity bits
            if (block_mode != 2)
            {
                // Sort out BCC parity bits
                packedEndpoints[subset * 2]     = bestEndpoints[subset * 2][0] & 1;
                packedEndpoints[subset * 2 + 1] = bestEndpoints[subset * 2 + 1][0] & 1;
                for (ch = 0; ch < g_modesettings[block_mode].channels3or4; ch++)
                {
                    bestEndpoints[subset * 2][ch] >>= 1;
                    bestEndpoints[subset * 2 + 1][ch] >>= 1;
                }
                rightAlignment++;
            }

            // Fixup endpoints
            for (ch = 0; ch < g_modesettings[block_mode].channels3or4; ch++)
            {
                packedEndpoints[subset * 2] |= bestEndpoints[subset * 2][ch] << rightAlignment;
                packedEndpoints[subset * 2 + 1] |= bestEndpoints[subset * 2 + 1][ch] << rightAlignment;
                rightAlignment += g_modesettings[block_mode].componentBits;
            }
        }
    }

    // Re-linearize the winning per-subset indices back into texel order
    CGU_UINT8 idxCount[3] = {0, 0, 0};
    for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
        CGU_UINT8 partsub         = cmp_get_partition_subset2(bestPartition, g_modesettings[block_mode].maxSubSets, k);
        CGU_UINT8 idxC            = idxCount[partsub];
        bestindex16[k]            = bestindex[partsub][idxC];
        idxCount[partsub]         = idxC + 1;
        shared_temp[k].colorindex = bestindex16[k];
    }

    CGU_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE];
    cmp_Encode_mode01237(block_mode, bestPartition, packedEndpoints, bestindex16, cmp_out);

    // Assemble the 16 output bytes into four little-endian 32-bit words.
    // NOTE(review): cmp_out[i] is int-promoted before "<< 24", which can reach
    // the sign bit; casting to CGU_UINT32 before the shift would be safer
    best_cmp_out[0] = (CGU_UINT32)cmp_out[0] + (CGU_UINT32)(cmp_out[1] << 8) + (CGU_UINT32)(cmp_out[2] << 16) + (CGU_UINT32)(cmp_out[3] << 24);
    best_cmp_out[1] = (CGU_UINT32)cmp_out[4] + (CGU_UINT32)(cmp_out[5] << 8) + (CGU_UINT32)(cmp_out[6] << 16) + (CGU_UINT32)(cmp_out[7] << 24);
    best_cmp_out[2] = (CGU_UINT32)cmp_out[8] + (CGU_UINT32)(cmp_out[9] << 8) + (CGU_UINT32)(cmp_out[10] << 16) + (CGU_UINT32)(cmp_out[11] << 24);
    best_cmp_out[3] = (CGU_UINT32)cmp_out[12] + (CGU_UINT32)(cmp_out[13] << 8) + (CGU_UINT32)(cmp_out[14] << 16) + (CGU_UINT32)(cmp_out[15] << 24);

    //CGU_Vec4ui block = {0, 0, 0, 0};
    //block_package1(block, bestPartition, 0);
    //best_cmp_out[0] = block[0];
    //best_cmp_out[1] = block[1];
    //best_cmp_out[2] = block[2];
    //best_cmp_out[3] = block[3];
    //
    //printSharedTemp();

    return best_err;
#else
    CGU_UINT8 bestPartition = 0;
    // Find the best partition
    CGU_UINT32 pbit = 0;
    CGU_UINT32 error;
    CGU_UINT32 bestErr  = MAX_UINT;
    CGU_UINT32 bestpbit = 0;
    for (CGU_UINT8 mode_blockPartition = 0; mode_blockPartition < 64; mode_blockPartition++)
    {
        error = cmp_GetPartitionError(pbit, mode_blockPartition, image_src);
        if (error < bestErr)
        {
            bestErr       = error;
            bestpbit      = pbit;
            bestPartition = mode_blockPartition;
        }
    }

    // Get the index for the partition
    for (CGU_INT threadInBlock = 15; threadInBlock >= 0; threadInBlock--)
    {
        ProcessBlock(1, bestPartition, 0, bestpbit, 0, threadInBlock, threadInBlock, 0);
    }

    // print results for debug
    printSharedTemp();

    //=======================
    // Encode final block
    //========================
    {
        // CGU_Vec4ui blockGreen = {0xffe00040, 0xfffe0007, 0x00000001, 0x00000000};
        // CGU_Vec4ui blockBlue  = {0x00000040, 0xfffffff8, 0x00000001, 0x00000000};
        // CGU_Vec4ui block00    = {0xf0617fc0, 0xfffe0c3f, 0xff00fe11, 0xff01ef00};
        CGU_Vec4ui blockRed   = {0x001fffc0, 0xfffe0000, 0x00000001, 0x00000000};
        CGU_Vec4ui block      = {0, 0, 0, 0};
        CGU_UINT32 input_mode = 1;  // NOTE(review): hard-coded mode - debug path only
        switch (input_mode)
        {
        case 1:
            block_package1(block, bestPartition, 0);
            break;
        case 3:
            block_package3(block, bestPartition, 0);
            break;
        case 7:
            block_package7(block, bestPartition, 0);
            break;
        default:  // error unsupported mode used!
            block = blockRed;
            break;
        }

        best_cmp_out[0] = block[0];
        best_cmp_out[1] = block[1];
        best_cmp_out[2] = block[2];
        best_cmp_out[3] = block[3];
    }

    return 0.0f;
#endif
}
#endif  // Not ASPM_HLSL

//======================================= MODES 45 =============================================
#ifndef ASPM_HLSL
#if defined(ENABLE_CMP_MODE4) || defined(ENABLE_CMP_MODE5)

// Compression results for one BC7 mode 4/5 encode candidate
struct cmp_mode_parameters2
{
    CGV_INT    color_qendpoint[8];  // two quantized colour endpoints, 4 components each
    CGV_INT    alpha_qendpoint[8];  // two quantized alpha endpoints, same layout
    CGV_UINT8  color_index[16];     // per-texel colour palette indices
    CGV_UINT8  alpha_index[16];     // per-texel alpha palette indices
    CGV_UINT32 idxMode;             // mode 4 index-selection bit (swaps the 2/3-bit index roles)
    CGV_UINT32 rotated_channel;     // rotation bits: which channel was swapped with alpha
};

// Component order used for each mode 4/5 rotation value: row r lists the source
// components {channel stored in the alpha slot, then the three colour slots}.
// Row 0 is the identity (alpha stays in alpha).
CMP_STATIC CMP_CONSTANT CGU_UINT8 componentRotations2[4][4] = {{COMP_ALPHA, COMP_RED, COMP_GREEN, COMP_BLUE},
                                                               {COMP_RED, COMP_ALPHA, COMP_GREEN, COMP_BLUE},
                                                               {COMP_GREEN, COMP_RED, COMP_ALPHA, COMP_BLUE},
                                                               {COMP_BLUE, COMP_RED, COMP_GREEN, COMP_ALPHA}};

// Right-shift helper; isolated so the (expected) HLSL perf warning appears once.
INLINE CGV_UINT8 old_shift_right_uint(CGV_UINT8 v, CGU_UINT8 bits)
{
    CGV_UINT8 result = v >> bits;  // (perf warning expected)
    return result;
}

// Writes 'bits' bits of bitVal into the packed byte array at bit position
// *offset (LSB-first), spilling high bits into the next byte when the field
// crosses a byte boundary, then advances *offset by 'bits'.
void old_Write8Bit(CGV_UINT8 base[], CGU_INT* uniform offset, CGU_INT bits, CGV_UINT8 bitVal)
{
    CGU_INT pos       = *offset;
    CGU_INT byteIndex = pos / 8;
    CGU_INT bitIndex  = pos % 8;

    base[byteIndex] |= bitVal << bitIndex;

    // Carry the remaining high bits into the following byte if needed.
    if (bitIndex + bits > 8)
    {
        base[byteIndex + 1] |= old_shift_right_uint(bitVal, 8 - bitIndex);
    }

    *offset = pos + bits;
}

// Exchanges the first n entries of the two index arrays element by element.
INLINE void old_swap_index(CGV_UINT8 u[], CGV_UINT8 v[], CGU_INT n)
{
    for (CGU_INT k = 0; k < n; k++)
    {
        CGV_UINT8 tmp = u[k];
        u[k]          = v[k];
        v[k]          = tmp;
    }
}

// Exchanges the first n entries of the two endpoint-component arrays.
INLINE void old_swap_epo(CGV_INT u[], CGV_INT v[], CGV_INT n)
{
    for (CGU_INT k = 0; k < n; k++)
    {
        CGV_INT tmp = u[k];
        u[k]        = v[k];
        v[k]        = tmp;
    }
}

// If the first (anchor) index's MSB would be set, swap the endpoint pair and
// mirror every index across the palette so the implicitly-encoded MSB of the
// anchor index becomes 0.
// endpoint    : 2*channels endpoint components; endpoint B starts at offset 'channels'
// channels    : components per endpoint (4 for the mode 4/5 callers here)
// block_index : per-texel indices, inverted in place when a swap occurs
// bits        : index precision; the palette has (1 << bits) levels
INLINE void old_encode_swap(CGV_INT endpoint[], CGU_INT channels, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits)
{
    CGU_INT levels = 1 << bits;
    if (block_index[0] >= levels / 2)
    {
        old_swap_epo(&endpoint[0], &endpoint[channels], channels);
        // Mirror all indices: i -> (levels-1) - i
        for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
#ifdef ASPM_GPU
            block_index[k] = (levels - 1) - block_index[k];
#else
            block_index[k] = CGV_UINT8(levels - 1) - block_index[k];
#endif
    }
}

// Appends the 16 per-texel indices to the bit stream. The anchor index
// (texel 0) is written one bit short - its MSB is implicitly zero after
// old_encode_swap - and the remaining texels use the full 'bits' width.
void old_encode_index(CGV_UINT8 data[16], CGU_INT* uniform pPos, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits)
{
    // Anchor index: MSB dropped.
    old_Write8Bit(data, pPos, bits - 1, block_index[0]);

    // Remaining texels at full precision.
    for (CGU_INT k = 1; k < SOURCE_BLOCK_SIZE; k++)
    {
        CGV_UINT8 idxVal = block_index[k] & 0xFF;
        old_Write8Bit(data, pPos, bits, idxVal);
    }
}

//----------------------------------------------------------------------------------
// Packs a mode 4 result into the 16-byte BC7 block: 5-bit mode prefix (00001),
// 2 rotation bits, 1 idxMode bit, 5-bit colour endpoints, 6-bit alpha
// endpoints, then the 2-bit and 3-bit index streams. When idxMode is set the
// colour/alpha index sets swap roles before encoding.
//----------------------------------------------------------------------------------
void cmp_Encode_mode4(CMP_INOUT CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], cmp_mode_parameters2 params)
{
    CGU_INT bitPosition = 4;  // Position the pointer at the LSB

    // Start from a cleared block
    for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
        cmp_out[k] = 0;

    // mode 4 (5 bits) 00001
    old_Write8Bit(cmp_out, &bitPosition, 1, 1);

    // rotation 2 bits
    old_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params.rotated_channel));

    // idxMode 1 bit
    old_Write8Bit(cmp_out, &bitPosition, 1, CMP_STATIC_CAST(CGV_UINT8, params.idxMode));

    // NOTE(review): idxBits is set up but never read below - the index widths
    // are hard-coded in the old_encode_index calls; confirm this is intended
    CGU_INT idxBits[2] = {2, 3};

    if (params.idxMode)
    {
        idxBits[0] = 3;
        idxBits[1] = 2;
        // Indicate if we need to fixup the index
        old_swap_index(params.color_index, params.alpha_index, 16);
        old_encode_swap(params.alpha_qendpoint, 4, params.color_index, 2);
        old_encode_swap(params.color_qendpoint, 4, params.alpha_index, 3);
    }
    else
    {
        old_encode_swap(params.color_qendpoint, 4, params.color_index, 2);
        old_encode_swap(params.alpha_qendpoint, 4, params.alpha_index, 3);
    }

    // color endpoints 5 bits each
    // R0 : R1
    // G0 : G1
    // B0 : B1
    for (CGU_INT component = 0; component < 3; component++)
    {
        old_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[component]));
        old_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[4 + component]));
    }

    // alpha endpoints (6 bits each)
    // A0 : A1
    old_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[0]));
    old_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[4]));

    // index 2 bits each  (31 bits total)
    old_encode_index(cmp_out, &bitPosition, params.color_index, 2);
    // index 3 bits each  (47 bits total)
    old_encode_index(cmp_out, &bitPosition, params.alpha_index, 3);
}

//----------------------------------------------------------------------------------
// Packs a mode 5 result into the 16-byte BC7 block: 6-bit mode prefix (000001),
// 2 rotation bits, 7-bit colour endpoints, 8-bit alpha endpoints, then two
// 2-bit index streams (colour followed by alpha).
//----------------------------------------------------------------------------------
void cmp_Encode_mode5(CMP_INOUT CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], cmp_mode_parameters2 params)
{
    // Start from a cleared block
    for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
        cmp_out[k] = 0;

    // mode 5 bits = 000001
    CGU_INT bitPosition = 5;  // Position the pointer at the LSB
    old_Write8Bit(cmp_out, &bitPosition, 1, 1);

    // Write 2 bit rotation
    old_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params.rotated_channel));

    // Ensure each anchor index's implicit MSB will be 0
    old_encode_swap(params.color_qendpoint, 4, params.color_index, 2);
    old_encode_swap(params.alpha_qendpoint, 4, params.alpha_index, 2);

    // color endpoints (7 bits each)
    // R0 : R1
    // G0 : G1
    // B0 : B1
    for (CGU_INT component = 0; component < 3; component++)
    {
        old_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[component]));
        old_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[4 + component]));
    }

    // alpha endpoints (8 bits each)
    // A0 : A1
    old_Write8Bit(cmp_out, &bitPosition, 8, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[0]));
    old_Write8Bit(cmp_out, &bitPosition, 8, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[4]));

    // color index 2 bits each  (31 bits total)
    // alpha index 2 bits each  (31 bits total)
    old_encode_index(cmp_out, &bitPosition, params.color_index, 2);
    old_encode_index(cmp_out, &bitPosition, params.alpha_index, 2);
}

//----------------------------------------------------------------------------------
// Compresses one 4x4 block with BC7 mode 4 or 5 (single-subset, separate alpha).
// Tries every channel rotation (and for mode 4 both index modes), quantizes the
// colour and alpha planes separately, refines endpoints, and keeps the packed
// 128-bit result of the best candidate in cmp_out[0..3].
// blockMode : 4 or 5
//----------------------------------------------------------------------------------
void Compress_mode45(CMP_INOUT CGU_UINT32 cmp_out[4], CGU_INT blockMode, CGU_Vec4ui image_src[SOURCE_BLOCK_SIZE])
{
    cmp_mode_parameters2 best_candidate;
    CGU_UINT32           channels3or4 = 4;
    CGU_UINT8            numClusters0[2];  // colour palette sizes per index mode
    CGU_UINT8            numClusters1[2];  // alpha palette sizes per index mode
    CGU_INT              modeBits[2];
    CGU_INT              max_idxMode;

    if (blockMode == 4)
    {
        max_idxMode     = 2;
        modeBits[0]     = 30;  // bits = 2 * (Red 5+ Grn 5+ blu 5)
        modeBits[1]     = 36;  // bits = 2 * (Alpha 6+6+6)
        numClusters0[0] = 4;
        numClusters0[1] = 8;
        numClusters1[0] = 8;
        numClusters1[1] = 4;
    }
    else
    {
        max_idxMode     = 1;
        modeBits[0]     = 42;  // bits = 2 * (Red 7+ Grn 7+ blu 7)
        modeBits[1]     = 48;  // bits = 2 * (Alpha 8+8+8) = 48
        numClusters0[0] = 4;
        numClusters0[1] = 4;
        numClusters1[0] = 4;
        numClusters1[1] = 4;
    }

    CGU_Vec4ui src_color_Block[SOURCE_BLOCK_SIZE];  // colour plane after rotation
    CGU_Vec4ui src_alpha_Block[SOURCE_BLOCK_SIZE];  // rotated-out channel replicated as a scalar plane
    CGV_FLOAT  best_err = CMP_FLOAT_MAX;

    // Go through each possible rotation and selection of index bits
    for (CGU_UINT8 rotated_channel = 0; rotated_channel < channels3or4; rotated_channel++)
    {
        // A

        // Build the rotated colour and alpha planes for this rotation
        for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
        {
            for (CGU_INT p = 0; p < 3; p++)
            {
                src_color_Block[k][p] = image_src[k][componentRotations2[rotated_channel][p + 1]];
                src_alpha_Block[k][p] = image_src[k][componentRotations2[rotated_channel][0]];
            }
            src_color_Block[k][3] = image_src[k][3];
            // NOTE(review): componentRotations2[3][3] is COMP_ALPHA regardless of
            // rotated_channel - confirm the fixed [3][3] index is intended
            src_alpha_Block[k][3] = image_src[k][componentRotations2[3][3]];
        }

        CGV_FLOAT err_quantizer;
        CGV_FLOAT err_bestQuantizer = CMP_FLOAT_MAX;

        for (CGU_INT idxMode = 0; idxMode < max_idxMode; idxMode++)
        {
            err_quantizer = cmp_GetQuantizeIndex_old(best_candidate.color_index, src_color_Block, SOURCE_BLOCK_SIZE, numClusters0[idxMode], 3);

            // Alpha error is down-weighted by 1/3 relative to the three colour channels
            err_quantizer += cmp_GetQuantizeIndex_old(best_candidate.alpha_index, src_alpha_Block, SOURCE_BLOCK_SIZE, numClusters1[idxMode], 3) / 3.0F;

            // If quality is high then run the full shaking for this config and
            // store the result if it beats the best overall error
            // Otherwise only run the shaking if the error is better than the best
            // quantizer error
            if (err_quantizer <= err_bestQuantizer)
            {
                err_bestQuantizer = err_quantizer;

                // Shake size gives the size of the shake cube
                CGV_FLOAT err_overallError;

                CGU_Vec4ui color_qendpoint2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                CGV_Vec4ui src_image_block[16];
                CGU_Vec4ui alpha_qendpoint2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

                CGU_UINT32 alpha_index[16];
                CGU_UINT32 color_index[16];

                // Widen the 8-bit candidate indices for the optimizer
                for (int k = 0; k < 16; k++)
                {
                    alpha_index[k] = best_candidate.alpha_index[k];
                    color_index[k] = best_candidate.color_index[k];
                }

                CGU_UINT32 color_index_packed_out[2] = {0, 0};
                CGU_UINT32 alpha_index_packed_out[2] = {0, 0};

                err_overallError = cmp_optimize_IndexAndEndPoints(
                    color_qendpoint2, color_index, color_index_packed_out, src_color_Block, 16, numClusters0[idxMode], modeBits[0], 3, 0.01f, blockMode);

                // Alpha scalar block
                err_overallError +=
                    cmp_optimize_IndexAndEndPoints(
                        alpha_qendpoint2, alpha_index, alpha_index_packed_out, src_alpha_Block, 16, numClusters1[idxMode], modeBits[1], 3, 0.01f, blockMode) /
                    3;

                // If we beat the previous best then encode the block
                if (err_overallError < best_err)
                {
                    best_err                       = err_overallError;
                    best_candidate.idxMode         = idxMode;
                    best_candidate.rotated_channel = rotated_channel;

                    // Flatten the optimized endpoint vectors into the candidate's arrays
                    best_candidate.alpha_qendpoint[0] = alpha_qendpoint2[0].x;
                    best_candidate.alpha_qendpoint[1] = alpha_qendpoint2[0].y;
                    best_candidate.alpha_qendpoint[2] = alpha_qendpoint2[0].z;
                    best_candidate.alpha_qendpoint[3] = alpha_qendpoint2[0].w;
                    best_candidate.alpha_qendpoint[4] = alpha_qendpoint2[1].x;
                    best_candidate.alpha_qendpoint[5] = alpha_qendpoint2[1].y;
                    best_candidate.alpha_qendpoint[6] = alpha_qendpoint2[1].z;
                    best_candidate.alpha_qendpoint[7] = alpha_qendpoint2[1].w;

                    best_candidate.color_qendpoint[0] = color_qendpoint2[0].x;
                    best_candidate.color_qendpoint[1] = color_qendpoint2[0].y;
                    best_candidate.color_qendpoint[2] = color_qendpoint2[0].z;
                    best_candidate.color_qendpoint[3] = color_qendpoint2[0].w;
                    best_candidate.color_qendpoint[4] = color_qendpoint2[1].x;
                    best_candidate.color_qendpoint[5] = color_qendpoint2[1].y;
                    best_candidate.color_qendpoint[6] = color_qendpoint2[1].z;
                    best_candidate.color_qendpoint[7] = color_qendpoint2[1].w;

                    for (int k = 0; k < 16; k++)
                    {
                        best_candidate.color_index[k] = color_index[k];
                        best_candidate.alpha_index[k] = alpha_index[k];
                    }

                    // Pack the winning candidate immediately
                    CGV_UINT8 cmp_out16[COMPRESSED_BLOCK_SIZE];
                    if (blockMode == 4)
                        cmp_Encode_mode4(cmp_out16, best_candidate);
                    else
                        cmp_Encode_mode5(cmp_out16, best_candidate);

                    // Assemble 16 bytes into four little-endian 32-bit words.
                    // NOTE(review): cmp_out16[i] is int-promoted before "<< 24";
                    // casting to CGU_UINT32 before the shift would be safer
                    cmp_out[0] =
                        (CGU_UINT32)cmp_out16[0] + (CGU_UINT32)(cmp_out16[1] << 8) + (CGU_UINT32)(cmp_out16[2] << 16) + (CGU_UINT32)(cmp_out16[3] << 24);
                    cmp_out[1] =
                        (CGU_UINT32)cmp_out16[4] + (CGU_UINT32)(cmp_out16[5] << 8) + (CGU_UINT32)(cmp_out16[6] << 16) + (CGU_UINT32)(cmp_out16[7] << 24);
                    cmp_out[2] =
                        (CGU_UINT32)cmp_out16[8] + (CGU_UINT32)(cmp_out16[9] << 8) + (CGU_UINT32)(cmp_out16[10] << 16) + (CGU_UINT32)(cmp_out16[11] << 24);
                    cmp_out[3] =
                        (CGU_UINT32)cmp_out16[12] + (CGU_UINT32)(cmp_out16[13] << 8) + (CGU_UINT32)(cmp_out16[14] << 16) + (CGU_UINT32)(cmp_out16[15] << 24);
                }
            }
        }  // B
    }      // A
}

#endif
#endif

#ifdef ENABLE_CMP_REFINE_MODE6_API

//----------------------------------------------------------------------------------
// Solves the 2x2 least-squares problem for the ideal (unquantized) endpoint pair
// given a fixed index assignment: each texel is modelled as a blend of the two
// endpoints with weight index/Mi_. Writes the two solved endpoints into
// image_cluster[0..1].
// Returns FALSE when the normal-equation determinant is zero (all texels map to
// a single active index, i.e. the system is degenerate); TRUE otherwise.
//----------------------------------------------------------------------------------
CGU_BOOL get_ideal_cluster2(CMP_INOUT CGV_Vec4f image_cluster[2],
                            CMP_IN CGU_UINT32   index_cluster[16],
                            CMP_IN CGU_INT      Mi_,
                            CMP_IN CGU_Vec4ui   image_src[16],
                            CMP_IN CGU_UINT32   numEntries,
                            CMP_IN CGU_UINT32   channels3or4)
{
    // get ideal cluster centers
    CGV_Vec4f image_cluster_mean[16];

    for (CGU_UINT32 ii = 0; ii < 16; ii++)
    {
        image_cluster_mean[ii] = 0.0f;
    }

    GetClusterMean2(image_cluster_mean, image_src, index_cluster, numEntries, channels3or4);  // unrounded

    // Accumulate the symmetric 2x2 normal-equation matrix and right-hand side
    CGV_FLOAT image_matrix0[2] = {0, 0};  // matrix /inverse matrix
    CGV_FLOAT image_matrix1[2] = {0, 0};  // matrix /inverse matrix
    CGV_Vec4f image_rp[2];                // right part for RMS fit problem

    image_rp[0] = 0.0f;
    image_rp[1] = 0.0f;

    // weight with cnt if running on compacted index
    for (CGU_UINT32 k = 0; k < numEntries; k++)
    {
        image_matrix0[0] += (Mi_ - index_cluster[k]) * (Mi_ - index_cluster[k]);
        image_matrix0[1] += index_cluster[k] * (Mi_ - index_cluster[k]);  // im is symmetric
        image_matrix1[1] += index_cluster[k] * index_cluster[k];

        image_rp[0] += image_cluster_mean[index_cluster[k]] * (CGU_FLOAT)(Mi_ - index_cluster[k]);
        image_rp[1] += image_cluster_mean[index_cluster[k]] * (CGU_FLOAT)index_cluster[k];
    }

    CGV_FLOAT matrix_dd = image_matrix0[0] * image_matrix1[1] - image_matrix0[1] * image_matrix0[1];

    // assert(matrix_dd !=0);
    // matrix_dd=0 means that index_cidx[k] and (Mi_-index_cidx[k]) collinear which implies only one active index;
    // taken care of separately
    if (matrix_dd == 0)
    {
        image_cluster[0] = 0.0f;
        image_cluster[1] = 0.0f;
        return FALSE;
    }

    // Invert the symmetric 2x2 matrix in place
    image_matrix1[0] = image_matrix0[0];
    image_matrix0[0] = image_matrix1[1] / matrix_dd;
    image_matrix1[1] = image_matrix1[0] / matrix_dd;

    image_matrix1[0] = image_matrix0[1] = -image_matrix0[1] / matrix_dd;

    CGV_FLOAT Mif = (CGV_FLOAT)Mi_;

    // values can exceed 255 here, clamp made no diff in quality!
    image_cluster[0] = (((image_rp[0] * image_matrix0[0]) + (image_rp[1] * image_matrix0[1])) * Mif);
    image_cluster[1] = (((image_rp[0] * image_matrix1[0]) + (image_rp[1] * image_matrix1[1])) * Mif);

    return TRUE;
}

//---------------------------------------------------------------------------------
// shake2: refine a quantized endpoint pair per channel by "shaking" — trying a
// small neighborhood of codes around each channel's ideal (floor-quantized)
// endpoint value, for every parity (p-bit) combination of the two endpoints.
//
// Outputs:
//   epo_code_shake[2] : best endpoint codes found (one Vec4 per endpoint).
// Inputs:
//   image_cluster[2]  : ideal (unquantized) endpoint colors from the cluster fit.
//   index_cluster[16] : per-pixel ramp indices for this candidate clustering.
//   image_src[16]     : source pixels.
//   index_bits        : bits per ramp index (4 for mode 6).
//   mtype             : unused here (kept for interface compatibility).
//   max_bits[4]       : endpoint precision per channel.
//   use_par           : 1 when the mode has a shared parity (p) bit; constrains
//                       shaken codes to a fixed parity and doubles the step.
//   numEntries        : pixel count, max 16.
//   channels3or4      : 3 = RGB, 4 = RGBA.
// Returns the summed per-channel squared error of the best parity combination.
//---------------------------------------------------------------------------------
CGV_FLOAT shake2(CMP_INOUT CGU_Vec4ui epo_code_shake[2],
                 CMP_IN CGV_Vec4f     image_cluster[2],
                 CMP_IN CGU_UINT32    index_cluster[16],
                 CMP_IN CGU_Vec4ui    image_src[16],
                 CMP_IN CGU_UINT32    index_bits,
                 CMP_IN CGU_UINT32    mtype,
                 CMP_IN CGU_UINT32    max_bits[4],
                 CMP_IN CGU_UINT32    use_par,
                 CMP_IN CGU_UINT32    numEntries,  // max 16
                 CMP_IN CGU_UINT32    channels3or4)
{
    CMP_UNUSED(mtype);
    CGV_FLOAT best_err = CMP_FLOAT_MAX;

#define SHAKESIZE1 1
#define SHAKESIZE2 2
    // shake single or                                   - cartesian
    // shake odd/odd and even/even or                    - same parity
    // shake odd/odd odd/even , even/odd and even/even   - bcc

    // Best squared error and endpoint codes per [parityA][parityB][channel].
    // NOTE(review): when use_par == 0 only [0][0][*] is written below, but the
    // pn loop reads indices from par_vectors42_nd that can be 1 — presumably
    // this function is only ever called with use_par = 1 (mode 6); confirm.
    CGV_FLOAT  err_ed[2][2][4];
    CGU_UINT32 epo_code_par[2][2][2][4];

    for (CGU_UINT32 ch = 0; ch < channels3or4; ch++)
    {
        CGU_UINT32 ppA = 0;
        CGU_UINT32 ppB = 0;
        CGU_UINT32 rr  = (use_par ? 2 : 1);  // 2 parity choices per endpoint when p-bit is in use
        CGU_UINT32 epo_code_epi0[2];  // endpoint 0: [0] = begin of shake range, [1] = end of shake range
        CGU_UINT32 epo_code_epi1[2];  // endpoint 1: [0] = begin of shake range, [1] = end of shake range

        for (ppA = 0; ppA < rr; ppA++)
        {  // loop max =2
            for (ppB = 0; ppB < rr; ppB++)
            {  //loop  max =2

                // set default ranges: start both range ends at the floor-quantized
                // code for this channel/parity; the range is widened below
                switch (ch)
                {
                case 0:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].x, max_bits[0], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].x, max_bits[0], use_par, ppB);
                    break;
                case 1:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].y, max_bits[1], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].y, max_bits[1], use_par, ppB);
                    break;
                case 2:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].z, max_bits[2], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].z, max_bits[2], use_par, ppB);
                    break;
                case 3:
                    if (channels3or4 == 4)
                    {
                        epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].w, max_bits[3], use_par, ppA);
                        epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].w, max_bits[3], use_par, ppB);
                    }
                    break;
                }

                // set begin range: move down by at most SHAKESIZE1 (clamped at 0).
                // The `& (~use_par)` forces the offset even when a parity bit is
                // in use, so shaken codes keep the requested parity.
                epo_code_epi0[0] -= ((epo_code_epi0[0] < SHAKESIZE1 ? epo_code_epi0[0] : SHAKESIZE1)) & (~use_par);
                epo_code_epi1[0] -= ((epo_code_epi1[0] < SHAKESIZE1 ? epo_code_epi1[0] : SHAKESIZE1)) & (~use_par);

                // set end range: move up by at most SHAKESIZE2, clamped so the
                // code stays within the channel's max_bits precision
                epo_code_epi0[1] +=
                    ((1 << max_bits[ch]) - 1 - epo_code_epi0[1] < SHAKESIZE2 ? (1 << max_bits[ch]) - 1 - epo_code_epi0[1] : SHAKESIZE2) & (~use_par);
                epo_code_epi1[1] +=
                    ((1 << max_bits[ch]) - 1 - epo_code_epi1[1] < SHAKESIZE2 ? (1 << max_bits[ch]) - 1 - epo_code_epi1[1] : SHAKESIZE2) & (~use_par);

                CGU_UINT32 step      = (1 << use_par);  // stride 2 preserves parity when p-bit is used
                err_ed[ppA][ppB][ch] = CMP_FLOAT_MAX;

                // Exhaustively try every (endpoint0, endpoint1) code pair in the
                // shake ranges and keep the pair with the lowest squared error.
                for (CGU_UINT32 epo_p0 = epo_code_epi0[0]; epo_p0 <= epo_code_epi0[1]; epo_p0 += step)
                {
                    for (CGU_UINT32 epo_p1 = epo_code_epi1[0]; epo_p1 <= epo_code_epi1[1]; epo_p1 += step)
                    {
                        CGV_FLOAT image_square_diff = 0.0F;
                        CGV_FLOAT image_ramp;

                        // NOTE(review): entry 0 is skipped here (_mc starts at 1)
                        // — presumably the anchor/fix-up pixel is handled by the
                        // clustering so its error is invariant; confirm.
                        for (CGU_UINT32 _mc = 1; _mc < numEntries; _mc++)
                        {
                            image_ramp = GetRamp2(epo_p0, epo_p1, index_cluster[_mc], index_bits);
                            switch (ch)
                            {
                            case 0:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].x);
                                break;
                            case 1:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].y);
                                break;
                            case 2:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].z);
                                break;
                            case 3:
                                if (channels3or4 == 4)
                                    image_square_diff += cmp_squaref(image_ramp - image_src[_mc].w);
                                break;
                            }
                        }

                        if (image_square_diff < err_ed[ppA][ppB][ch])
                        {
                            err_ed[ppA][ppB][ch]          = image_square_diff;
                            epo_code_par[ppA][ppB][0][ch] = epo_p0;
                            epo_code_par[ppA][ppB][1][ch] = epo_p1;
                        }
                    }
                }
            }  // pp1
        }      // pp0
    }          // j

    //---------------------------------------------------------
    // Pick the parity-vector combination with the lowest total
    // (summed over channels) error and emit its endpoint codes.
    // CMP_CONSTANT CGU_UINT8 npv_nd[2][8] = {
    //     {1, 2, 4, 8, 16, 32, 0, 0},  // 3 channel
    //     {1, 2, 4, 0, 0, 0, 0, 0}     // 4 channel  tyep index 0..7
    // };
    // for (CGU_INT pn = 0; pn < npv_nd[channels3or4 - 3][type]; pn++)
    CGU_UINT32 bits = 4;  // number of parity vectors tried; for mode 6 its 4
    for (CGU_UINT32 pn = 0; pn < bits; pn++)
    {
        CGV_FLOAT  err_2 = 0.0F;
        CGU_UINT32 d1    = 0;
        CGU_UINT32 d2    = 0;

        for (CGU_UINT32 ch = 0; ch < channels3or4; ch++)
        {
            d1 = par_vectors42_nd[pn][0][ch];
            d2 = par_vectors42_nd[pn][1][ch];
            err_2 += err_ed[d1][d2][ch];
        }

        if (err_2 < best_err)
        {
            best_err            = err_2;
            d1                  = par_vectors42_nd[pn][0][0];
            d2                  = par_vectors42_nd[pn][1][0];
            epo_code_shake[0].x = epo_code_par[d1][d2][0][0];
            epo_code_shake[1].x = epo_code_par[d1][d2][1][0];

            d1                  = par_vectors42_nd[pn][0][1];
            d2                  = par_vectors42_nd[pn][1][1];
            epo_code_shake[0].y = epo_code_par[d1][d2][0][1];
            epo_code_shake[1].y = epo_code_par[d1][d2][1][1];

            d1                  = par_vectors42_nd[pn][0][2];
            d2                  = par_vectors42_nd[pn][1][2];
            epo_code_shake[0].z = epo_code_par[d1][d2][0][2];
            epo_code_shake[1].z = epo_code_par[d1][d2][1][2];

            if (channels3or4 == 4)
            {
                d1                  = par_vectors42_nd[pn][0][3];
                d2                  = par_vectors42_nd[pn][1][3];
                epo_code_shake[0].w = epo_code_par[d1][d2][0][3];
                epo_code_shake[1].w = epo_code_par[d1][d2][1][3];
            }
        }
    }

    return best_err;
}

//---------------------------------------------------------------------------------
// requantized_image_err2: rebuild the color ramp implied by a new endpoint-code
// pair, then re-index every source pixel against that ramp.
//
// Outputs:
//   index_best[16] : for each pixel, the ramp index with the smallest squared
//                    RGBA distance to it.
// Returns the sum of those per-pixel minimum squared errors.
//---------------------------------------------------------------------------------
CGV_FLOAT requantized_image_err2(CMP_INOUT CGU_UINT32 index_best[16],
                                 CMP_IN CGU_Vec4ui    epo_code_best[2],
                                 CMP_IN CGU_UINT32    index_bits,
                                 CMP_IN CGU_UINT32    max_bits[4],
                                 CMP_IN CGU_Vec4ui    image_src[16],
                                 CMP_IN CGU_UINT32    numEntries,  // max 16
                                 CMP_IN CGU_UINT32    channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)

    CMP_UNUSED(channels3or4);
    CMP_UNUSED(max_bits);

    //===========================================================
    // Rebuild the per-index ramp colors from the new epo codes
    //===========================================================
    CGV_Vec4f ramp_color[16];

    for (CGU_UINT32 ri = 0; ri < numEntries; ri++)
    {
        ramp_color[ri].x = GetRamp2(epo_code_best[0].x, epo_code_best[1].x, ri, index_bits);
        ramp_color[ri].y = GetRamp2(epo_code_best[0].y, epo_code_best[1].y, ri, index_bits);
        ramp_color[ri].z = GetRamp2(epo_code_best[0].z, epo_code_best[1].z, ri, index_bits);
        ramp_color[ri].w = GetRamp2(epo_code_best[0].w, epo_code_best[1].w, ri, index_bits);
    }

    //===========================================================
    // For each pixel, find the nearest ramp entry (squared RGBA
    // distance) and accumulate the total requantization error
    //===========================================================
    CGV_FLOAT total_err = 0.0F;

    for (CGU_UINT32 px = 0; px < numEntries; px++)
    {
        // Seed the minimum above any reachable error:
        // max per-pixel error is (256 * 256 * 4), so use that + 1
        CGV_FLOAT  min_err  = 262145.0f;
        CGU_UINT32 min_indx = 0;

        for (CGU_UINT32 ri = 0; ri < numEntries; ri++)
        {
            CGV_Vec4f pixel_diff;
            pixel_diff.x = ramp_color[ri].x - image_src[px].x;
            pixel_diff.y = ramp_color[ri].y - image_src[px].y;
            pixel_diff.z = ramp_color[ri].z - image_src[px].z;
            pixel_diff.w = ramp_color[ri].w - image_src[px].w;

            CGV_FLOAT ramp_err = cmp_dot4f(pixel_diff, pixel_diff);
            if (ramp_err < min_err)
            {
                min_err  = ramp_err;
                min_indx = ri;
            }
        }

        index_best[px] = min_indx;
        total_err += min_err;
    }

    return total_err;
}

//---------------------------------------------------------------------------------
// cmp_mode6_optimize_IndexAndEndPoints: iterative refinement of a BC7 mode-6
// index set and endpoint pair.
//
// Each iteration: collapse the current indices to a dense 0..MaxIndex range,
// then for every (slope, offset) remapping of those indices that still fits in
// the 0..Mi_ index space, fit ideal cluster endpoints (get_ideal_cluster2),
// shake them to the best quantized codes (shake2), requantize the image against
// the winner, and keep the result if the total error improved.
//
// In/Out:
//   epo_code_out[2] : best endpoint codes found.
//   index_io[16]    : in: initial indices (must be 0..15); out: best indices.
// Inputs:
//   image_src[16]   : source pixels.
//   numEntries      : pixel count, max 16.
//   Mi_             : cluster count (<= 16); converted to max index below.
//   bits            : total endpoint bits (unused — mode 6 values hard-coded).
//   channels3or4    : 3 = RGB, 4 = RGBA.
//   errorThreshold  : early-out when the best error reaches this target.
// Returns the best requantized error found (0.0 for a solid-color block).
//---------------------------------------------------------------------------------
CGV_FLOAT cmp_mode6_optimize_IndexAndEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],  //
                                               CMP_INOUT CGU_UINT32 index_io[16],     // Make sure input index is 0..15 range
                                               CMP_IN CGU_Vec4ui    image_src[16],
                                               CMP_IN CGU_UINT32    numEntries,    // max 16
                                               CMP_IN CGU_UINT32    Mi_,           // last cluster , This should be no larger than 16
                                               CMP_IN CGU_UINT32    bits,          // total for all components
                                               CMP_IN CGU_UINT32    channels3or4,  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
                                               CMP_IN CGU_FLOAT     errorThreshold)
{
    CMP_UNUSED(bits);
    CGV_FLOAT  err_best    = CMP_FLOAT_MAX;
    CGU_UINT32 type        = 2;             // = bits %  (2 * channels3or4) for Mode 6 with 58 bits and 4 channels type is 2
    CGU_UINT32 use_par     = 1;             // as type == 2 use par is 1 = (type != 0);
    CGU_UINT32 max_bits[4] = {8, 8, 8, 8};  // Mode 6 max bits is 8 = (bits + channels2 - 1) / channels2;
    CGU_UINT32 index_bits  = 4;             // channel bits !! = 4
                                            // CGU_INT   iv;
                                            // iv = Mi_;
                                            // while (iv >>= 1)
                                            //     index_bits++;

    // Convert cluster count to the largest usable index value
    Mi_ = Mi_ - 1;

    CGU_UINT32 index_tmp[16];
    CGU_UINT32 maxTry      = MAX_TRY_SHAKER;  // should be set by quality
    CGV_FLOAT  err_requant = 0.0F;

    // Init best index to input index
    for (CGU_UINT32 k = 0; k < numEntries; k++)
        index_tmp[k] = index_io[k];

    CGU_UINT32 MaxIndex;

    // Collapse indices to a dense range; presumably index_collapse2 remaps the
    // used index values to 0..MaxIndex and returns MaxIndex — confirm
    MaxIndex = index_collapse2(index_tmp, numEntries);

    // we have a solid color 4x4 block no need for optimization!
    if (MaxIndex == 0)
        return 0.0f;

    for (CGU_UINT32 ii = 0; ii < maxTry; ii++)
    {
        //===============================
        // We have ramp colors to process
        //===============================
        CGV_FLOAT  err_cluster = CMP_FLOAT_MAX;
        CGV_FLOAT  err_shake;
        CGU_UINT32 index_cluster[16];
        CGU_Vec4ui epo_code_best[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

        for (CGU_UINT32 ii2 = 0; ii2 < numEntries; ii2++)
            index_cluster[ii2] = 0;

        CGU_UINT32 mi = Mi_;

        // Try every linear remap (slope * index + offset) of the collapsed
        // indices that keeps them within the 0..mi index space
        for (CGU_UINT32 index_slope = 1; (index_slope * MaxIndex) <= mi; index_slope++)
        {
            CGV_Vec4f image_cluster[2] = {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};

            for (CGU_UINT32 index_offset = 0; index_offset <= (mi - index_slope * MaxIndex); index_offset++)
            {
                //-------------------------------------
                // set a new index data to try
                //-------------------------------------
                for (CGU_UINT32 k = 0; k < numEntries; k++)
                    index_cluster[k] = index_tmp[k] * index_slope + index_offset;

                // Fit ideal endpoints for this index assignment; FALSE means the
                // cluster was degenerate (single active index) — skip it
                if (get_ideal_cluster2(image_cluster, index_cluster, Mi_, image_src, numEntries, channels3or4))
                {
                    CGU_Vec4ui epo_code_shake[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                    err_shake                    = shake2(epo_code_shake,  // return new epo
                                       image_cluster,
                                       index_cluster,
                                       image_src,
                                       index_bits,
                                       type,
                                       max_bits,
                                       use_par,
                                       numEntries,  // max 16
                                       channels3or4);

                    if (err_shake < err_cluster)
                    {
                        err_cluster      = err_shake;
                        epo_code_best[0] = epo_code_shake[0];
                        epo_code_best[1] = epo_code_shake[1];
                    }
                }
            }
        }

        if ((err_cluster != CMP_FLOAT_MAX))
        {
            //=========================
            // test results for quality
            //=========================
            CGU_UINT32 index_best[16] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
            err_requant               = requantized_image_err2(index_best,     // new index results
                                                 epo_code_best,  // prior result input
                                                 index_bits,
                                                 max_bits,
                                                 image_src,
                                                 numEntries,
                                                 channels3or4);
            if (err_requant < err_best)
            {
                //better = 1;
                // Adopt the improved indices both as the output and as the seed
                // for the next refinement iteration
                for (CGU_UINT32 k = 0; k < numEntries; k++)
                    index_io[k] = index_tmp[k] = index_best[k];

                //cmp_pack4bitindex(index_packed_out, index_io);
                epo_code_out[0] = epo_code_best[0];
                epo_code_out[1] = epo_code_best[1];
                err_best        = err_requant;
            }
        }

        // Early out if we have our target err
        if (err_best <= errorThreshold)
            break;

        // Re-collapse for the next pass; a solid result means no further gain
        MaxIndex = index_collapse2(index_tmp, numEntries);
        if (MaxIndex == 0)
            break;
    }

    // Did not find anything better over Max trys
    return err_best;
}

#endif

#endif  // ENABLE_CMP_API : CPU & GPU Code block

//=================================================================================
// GPU API Interfaces
// mode 4 5 6 all have 1 subset per block, and fix-up index is always index 0
//=================================================================================
//---------------------------------------------------------------------------------
// TryMode456CS: compute-shader entry point that evaluates BC7 modes 4, 5 and 6
// for each 4x4 block (16 threads per block). Threads 0..7 try mode 4 (rotation x
// index-selector combinations), threads 8..11 try mode 5 (rotations), threads
// 12..15 try mode 6 (p-bit combinations). A log2 reduction over shared_temp then
// writes the lowest-error candidate to g_OutBuff1[blockID].
//---------------------------------------------------------------------------------
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void TryMode456CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)
{
    CMP_CONSTANT CGU_UINT32 MAX_USED_THREAD = 16;  // threads cooperating on one 4x4 block
    CGU_UINT32              BLOCK_IN_GROUP  = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32              blockInGroup    = GI / MAX_USED_THREAD;
    CGU_UINT32              blockID         = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32              threadBase      = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32              threadInBlock   = GI - threadBase;

    // Texel coordinates of this block's top-left corner
    CGU_UINT32 block_y = blockID / g_num_block_x;
    CGU_UINT32 block_x = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x  = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y  = block_y * BLOCK_SIZE_Y;

#if (defined(ENABLE_MODE4) || defined(ENABLE_MODE5) || defined(ENABLE_MODE6) || defined(ENABLE_CMP_MODE6))

    // Each of the 16 threads loads one pixel (scaled to 0..255) into shared
    // memory and seeds the per-thread min/max endpoints with it
    if (threadInBlock < 16)
    {
        CGU_Vec4f px            = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px                      = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;

        shared_temp[GI].endPoint_low  = shared_temp[GI].pixel;
        shared_temp[GI].endPoint_high = shared_temp[GI].pixel;
    }
    GroupSync();

    // Parallel log2 reduction (16 -> 8 -> 4 -> 2 -> 1) computing the block's
    // component-wise min (endPoint_low) and max (endPoint_high)
    if (threadInBlock < 8)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
    }
    GroupSync();

    if (threadInBlock < 4)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
    }
    GroupSync();

    if (threadInBlock < 2)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
    }
    GroupSync();

    if (threadInBlock < 1)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
    }
    GroupSync();

    // Every thread reads the block-wide min/max endpoints from slot 0
    CGU_Vec4ui endPoint[2];
    endPoint[0] = shared_temp[threadBase].endPoint_low;
    endPoint[1] = shared_temp[threadBase].endPoint_high;

    CGU_UINT32 error          = 0xFFFFFFFF;
    CGU_UINT32 mode           = 0;
    CGU_UINT32 index_selector = 0;
    CGU_UINT32 rotation       = 0;

    CGU_Vec2ui indexPrec;
    if (threadInBlock < 8)  // all threads of threadInBlock < 8 will be working on trying out mode 4, since only mode 4 has index selector bit
    {
        if (0 == (threadInBlock & 1))  // thread 0, 2, 4, 6
        {
            //2 represents 2bit index precision; 1 represents 3bit index precision
            index_selector = 0;
            indexPrec      = CGU_Vec2ui(2, 1);
        }
        else  // thread 1, 3, 5, 7
        {
            //2 represents 2bit index precision; 1 represents 3bit index precision
            index_selector = 1;
            indexPrec      = CGU_Vec2ui(1, 2);
        }
    }
    else
    {
        //2 represents 2bit index precision
        indexPrec = CGU_Vec2ui(2, 2);
    }

    CGU_Vec4ui pixel_r;
    CGU_UINT32 color_index;
    CGU_UINT32 alpha_index;
    CGU_Vec4i  span;
    CGU_Vec2i  span_norm_sqr;
    CGU_Vec2i  dotProduct;

#if defined(ENABLE_MODE4) || defined(ENABLE_MODE5)
    if (threadInBlock < 12)  // Try mode 4 5 in threads 0..11
    {
        CGU_Vec4ui ep_quantized[2];
        // mode 4 5 have component rotation
        if ((threadInBlock < 2) || (8 == threadInBlock))  // rotation = 0 in thread 0, 1
        {
            rotation = 0;
        }
        else if ((threadInBlock < 4) || (9 == threadInBlock))  // rotation = 1 in thread 2, 3
        {
            rotation = 1;
            set_pixel_rotation(endPoint[0], rotation);
            set_pixel_rotation(endPoint[1], rotation);
        }
        else if ((threadInBlock < 6) || (10 == threadInBlock))  // rotation = 2 in thread 4, 5
        {
            rotation = 2;
            set_pixel_rotation(endPoint[0], rotation);
            set_pixel_rotation(endPoint[1], rotation);
        }
        else if ((threadInBlock < 8) || (11 == threadInBlock))  // rotation = 3 in thread 6, 7
        {
            rotation = 3;
            set_pixel_rotation(endPoint[0], rotation);
            set_pixel_rotation(endPoint[1], rotation);
        }

        if (threadInBlock < 8)  // try mode 4 in threads 0..7
        {
            // mode 4 thread distribution
            // Thread           0   1   2   3   4   5   6   7
            // Rotation         0   0   1   1   2   2   3   3
            // Index selector   0   1   0   1   0   1   0   1

            mode = 4;
            compress_endpoints4(endPoint, ep_quantized);
        }
        else  // try mode 5 in threads 8..11
        {
            // mode 5 thread distribution
            // Thread    8  9  10  11
            // Rotation  0  1   2   3

            mode = 5;
            compress_endpoints5(endPoint, ep_quantized);
        }

        CGU_Vec4ui pixel = shared_temp[threadBase + 0].pixel;
        set_pixel_rotation(pixel, rotation);

        // Endpoint span and its squared length, split RGB vs alpha since mode
        // 4/5 index color and alpha separately
        span          = cmp_castimp(endPoint[1] - endPoint[0]);
        span_norm_sqr = CGU_Vec2i(dot(span.rgb, span.rgb), span.a * span.a);

        // Orient the span so endPoint[0] is nearest pixel 0 (the anchor index);
        // should be the same as above
        CGU_Vec3ui diff0 = pixel.rgb - endPoint[0].rgb;
        CGU_Vec3ui diff1 = pixel.rgb - endPoint[1].rgb;
        dotProduct       = CGU_Vec2i(dot(diff0, diff0), dot(diff1, diff1));

        if (dotProduct.x > dotProduct.y)
        {
            span.rgb.x = -span.rgb.x;
            span.rgb.y = -span.rgb.y;
            span.rgb.z = -span.rgb.z;
            swap(endPoint[0].rgb, endPoint[1].rgb);
        }

        // Same orientation fix-up for the alpha channel
        CGU_UINT32 diffa0 = pixel.a - endPoint[0].a;
        CGU_UINT32 diffa1 = pixel.a - endPoint[1].a;
        dotProduct        = CGU_Vec2i(dot(diffa0, diffa0), dot(diffa1, diffa1));
        if (dotProduct.x > dotProduct.y)
        {
            span.a = -span.a;
            swap(endPoint[0].a, endPoint[1].a);
        }

        // Accumulate the reconstruction error over all 16 pixels
        error = 0;
        for (CGU_UINT32 i = 0; i < 16; i++)
        {
            pixel = shared_temp[threadBase + i].pixel;
            set_pixel_rotation(pixel, rotation);

            diff0 = pixel.rgb - endPoint[0].rgb;

            // Project the pixel onto the span to pick its color index
            dotProduct.x = dot(span.rgb, diff0);
            color_index =
                (span_norm_sqr.x <= 0 /*endPoint[0] == endPoint[1]*/ || dotProduct.x <= 0 /*pixel == endPoint[0]*/)
                    ? 0
                    : ((dotProduct.x < span_norm_sqr.x) ? aStep[indexPrec.x][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[indexPrec.x][63]);

            diffa0       = pixel.a - endPoint[0].a;
            dotProduct.y = dot(span.a, diffa0);
            alpha_index =
                (span_norm_sqr.y <= 0 || dotProduct.y <= 0)
                    ? 0
                    : ((dotProduct.y < span_norm_sqr.y) ? aStep[indexPrec.y][CGU_UINT32(dotProduct.y * 63.49999 / span_norm_sqr.y)] : aStep[indexPrec.y][63]);

            // Reconstruct the pixel from the weighted endpoints (6-bit weights)
            pixel_r.rgb = (endPoint[0].rgb * (64 - aWeight[indexPrec.x][color_index]) + endPoint[1].rgb * aWeight[indexPrec.x][color_index] + 32U);

            pixel_r.rgb.x = pixel_r.rgb.x >> 6;
            pixel_r.rgb.y = pixel_r.rgb.y >> 6;
            pixel_r.rgb.z = pixel_r.rgb.z >> 6;

            pixel_r.a = (endPoint[0].a * (64 - aWeight[indexPrec.y][alpha_index]) + endPoint[1].a * aWeight[indexPrec.y][alpha_index] + 32) >> 6;

            Ensure_A_Is_Larger(pixel_r, pixel);

            pixel_r -= pixel;
            set_pixel_rotation(pixel_r, rotation);
            error += ComputeError(pixel_r, pixel_r);
        }
    }
    else
#endif
#ifdef ENABLE_MODE6
        if (threadInBlock < 16)  // Try mode 6 in threads 12..15, since in mode 4 5 6, only mode 6 has p bit
    {
        // Each of the 4 threads tries one (p0, p1) endpoint parity combination
        CGU_UINT32 p = threadInBlock - 12;
        CGU_Vec4ui ep_quantized[2];

        compress_endpoints6(endPoint, ep_quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));

        CGU_Vec4ui pixel = shared_temp[threadBase + 0].pixel;

        span          = cmp_castimp(endPoint[1] - endPoint[0]);
        span_norm_sqr = dot(span, span);

        // Orient the span so pixel 0 (the anchor) maps to the lower index half
        CGU_Vec4ui diff4 = pixel - endPoint[0];
        dotProduct       = dot(span, diff4);
        if (span_norm_sqr.x > 0 && dotProduct.x >= 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
        {
            span = -span;
            swap(endPoint[0], endPoint[1]);
        }

        // Accumulate the reconstruction error over all 16 pixels (mode 6 uses a
        // single 4-bit index covering all four channels)
        error = 0;
        for (CGU_UINT32 i = 0; i < 16; i++)
        {
            pixel        = shared_temp[threadBase + i].pixel;
            diff4        = pixel - endPoint[0];
            dotProduct.x = dot(span, diff4);
            color_index  = (span_norm_sqr.x <= 0 || dotProduct.x <= 0)
                               ? 0
                               : ((dotProduct.x < span_norm_sqr.x) ? aStep[0][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[0][63]);

            pixel_r = (endPoint[0] * (64 - aWeight[0][color_index]) + endPoint[1] * aWeight[0][color_index] + 32U) >> 6;

            Ensure_A_Is_Larger(pixel_r, pixel);
            pixel_r -= pixel;
            error += ComputeError(pixel_r, pixel_r);
        }

        mode     = 6;
        rotation = p;  // Borrow rotation for p
    }
#endif

    // Publish each thread's candidate, then reduce to the lowest-error one
    shared_temp[GI].error          = error;
    shared_temp[GI].mode           = mode;
    shared_temp[GI].index_selector = index_selector;
    shared_temp[GI].rotation       = rotation;
    GroupSync();

    // Parallel min-error reduction (16 -> 8 -> 4 -> 2 -> 1) over the candidates
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 8].error;
            shared_temp[GI].mode           = shared_temp[GI + 8].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 4].error;
            shared_temp[GI].mode           = shared_temp[GI + 4].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 2].error;
            shared_temp[GI].mode           = shared_temp[GI + 2].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 1].error;
            shared_temp[GI].mode           = shared_temp[GI + 1].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 1].rotation;
        }

        // Save the fast-mode (4/5/6) winner settings for this block.
        // (For mode 6 the rotation field carries the winning p-bit combination.)
        g_OutBuff1[blockID].error          = shared_temp[GI].error;
        g_OutBuff1[blockID].mode           = shared_temp[GI].mode & 0x07;
        g_OutBuff1[blockID].rotation       = shared_temp[GI].rotation;
        g_OutBuff1[blockID].index_selector = shared_temp[GI].index_selector;
        g_OutBuff1[blockID].partition      = 0;
        g_OutBuff1[blockID].data2          = 0;

        // Enable cmp test: optionally re-encode mode 6 with the CMP optimizer
        // for higher quality than the fast path above
#ifdef ENABLE_CMP_MODE6
        if ((g_quality > 0.05f)
#ifdef ENABLE_MODE6
            && (shared_temp[GI].mode == 6)
#endif
        )
        {
            CGU_Vec4ui image_src[16];
            for (int i = 0; i < 16; i++)
            {
                image_src[i].x = shared_temp[threadBase + i].pixel.x;
                image_src[i].y = shared_temp[threadBase + i].pixel.y;
                image_src[i].z = shared_temp[threadBase + i].pixel.z;
                image_src[i].w = shared_temp[threadBase + i].pixel.w;
            }

            CGU_Vec4ui epo_code_out[2]     = {{0, 0, 0, 0}, {0, 0, 0, 0}};
            CGU_UINT32 index_packed_out[2] = {0, 0};
            CGU_UINT32 cmp_out6[4]         = {0, 0, 0, 0};
            CGU_UINT32 best_index_out[16];

            CGU_UINT32 besterr = cmp_GetIndexedEndPoints(epo_code_out,
                                                         best_index_out,
                                                         image_src,
                                                         15,  // numEntries 0..15 (Note this function is changed from using 16)
                                                         0xffffffff);

            // Error cal needs updating to be the same all over
            //if (besterr > shared_temp[GI].error)
            {
                cmp_pack4bitindex32(index_packed_out, best_index_out);

#ifdef ENABLE_CMP_REFINE_MODE6_API
                if (g_quality > 0.5f)
                {
                    // Refined for better quailty using prior best_index_out initial input
                    besterr = cmp_mode6_optimize_IndexAndEndPoints(epo_code_out,
                                                                   best_index_out,
                                                                   image_src,
                                                                   16,                              // numEntries
                                                                   g_modesettings[6].clusters,      // 16,
                                                                   g_modesettings[6].bits,          // 58,
                                                                   g_modesettings[6].channels3or4,  // 4,
                                                                   0.1f);

                    cmp_pack4bitindex32(index_packed_out, best_index_out);
                }
#endif

                cmp_encode_mode6(cmp_out6, epo_code_out, index_packed_out);

                // Addin CMP results; mode bit 0x10 flags a pre-encoded block in data2
                g_OutBuff1[blockID].error   = besterr;
                g_OutBuff1[blockID].mode    = 6 | 0x10;
                g_OutBuff1[blockID].data2.x = cmp_out6[0];
                g_OutBuff1[blockID].data2.y = cmp_out6[1];
                g_OutBuff1[blockID].data2.z = cmp_out6[2];
                g_OutBuff1[blockID].data2.w = cmp_out6[3];

            }  // if better then fast mode
        }
#endif
    }

#else
    // Init: no modes enabled — emit a sentinel result for this block
    if (threadInBlock < 1)
    {
        g_OutBuff1[blockID].error          = MAX_UINT;
        g_OutBuff1[blockID].mode           = 0;
        g_OutBuff1[blockID].rotation       = 0;
        g_OutBuff1[blockID].index_selector = 0;
        g_OutBuff1[blockID].partition      = 0;
        g_OutBuff1[blockID].data2          = 0;
    }
    GroupSync();
#endif
}

//--------------------------------------------------------------------------------------
// TryMode137CS: scores BC7 modes 1, 3 and 7 (the three two-subset modes) for each
// 4x4 block. 64 threads cooperate on one block: each thread evaluates one of the 64
// candidate two-subset partitions (including every p-bit combination for that mode),
// then a log2 min-reduction over shared memory picks the best partition. The result
// replaces the entry in g_OutBuff1 only if its error beats the best mode found so far
// (carried in from g_InBuff).
//--------------------------------------------------------------------------------------
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1)
void TryMode137CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)  // mode 1 3 7 all have 2 subsets per block
{
    const CGU_UINT32 MAX_USED_THREAD = 64;  // threads actually used per block: one per candidate partition
    CGU_UINT32       BLOCK_IN_GROUP  = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32       blockInGroup    = GI / MAX_USED_THREAD;
    CGU_UINT32       blockID         = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32       threadBase      = blockInGroup * MAX_USED_THREAD;  // first shared_temp slot owned by this block
    CGU_UINT32       threadInBlock   = GI - threadBase;                 // 0..63 within the block's thread set

    // Texel coordinates of this block's top-left corner in the source image.
    CGU_UINT32 block_y = blockID / g_num_block_x;
    CGU_UINT32 block_x = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x  = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y  = block_y * BLOCK_SIZE_Y;

    // First 16 threads load the block's 16 texels into shared memory as 0..255 integers.
    if (threadInBlock < 16)
    {
        CGU_Vec4f px            = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px                      = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
    }
    GroupSync();

    // Seed every slot with worst-case error so unused threads lose the min-reduction.
    shared_temp[GI].error = 0xFFFFFFFF;

    // Use this to test only one of modes 1,3, or 7
    // if (g_mode_id != 7) {
    //     if (threadInBlock == 0)
    //         g_OutBuff1[blockID].error           = g_InBuff[blockID].error;
    //         g_OutBuff1[blockID].mode            = g_InBuff[blockID].mode;
    //         g_OutBuff1[blockID].partition       = g_InBuff[blockID].partition;
    //         g_OutBuff1[blockID].index_selector  = g_InBuff[blockID].index_selector;
    //         g_OutBuff1[blockID].rotation        = g_InBuff[blockID].rotation;
    //         g_OutBuff1[blockID].data2           = g_InBuff[blockID].data2;
    //      return;
    // }

#if defined(ENABLE_MODE1) || defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
    CGU_Vec4ui pixel_r;
    CGU_Vec4ui endPoint[2][2];  // endPoint[0..1 for subset id][0..1 for low and high in the subset]
    CGU_Vec4ui endPointBackup[2][2];
    CGU_UINT32 color_index;
    if (threadInBlock < 64)
    {
        CGU_UINT32 partition = threadInBlock;  // this thread's candidate partition index (0..63)
        CGU_UINT32 i;

        // Compute the axis-aligned bounding box (min/max endpoints) of each subset.
        endPoint[0][0]  = MAX_UINT;
        endPoint[0][1]  = MIN_UINT;
        endPoint[1][0]  = MAX_UINT;
        endPoint[1][1]  = MIN_UINT;
        CGU_UINT32 bits = blockPartitions[partition];  // 16x1-bit subset mask for this partition
        for (i = 0; i < 16; i++)
        {
            CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
            if (((bits >> i) & 0x01) == 1)
            {
                endPoint[1][0] = cmp_min(endPoint[1][0], pixel);
                endPoint[1][1] = cmp_max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = cmp_min(endPoint[0][0], pixel);
                endPoint[0][1] = cmp_max(endPoint[0][1], pixel);
            }
        }

        // Keep the unquantized endpoints; each p-bit trial below re-quantizes from these.
        endPointBackup[0][0] = endPoint[0][0];
        endPointBackup[0][1] = endPoint[0][1];
        endPointBackup[1][0] = endPoint[1][0];
        endPointBackup[1][1] = endPoint[1][1];

        CGU_UINT32 max_p = 2;  // mode 1

#if defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
        if (g_mode_id != 1)
        {
            // in mode 3 7, there are two p bits per subset, one for each end point
            max_p = 4;
        }
#endif
        CGU_UINT32 final_p[2] = {0, 0};               // best p-bit choice found per subset
        CGU_UINT32 error[2]   = {MAX_UINT, MAX_UINT};  // best error found per subset
        for (CGU_UINT32 p = 0; p < max_p; p++)
        {
            // Restore the raw endpoints before quantizing with this p-bit combination.
            endPoint[0][0] = endPointBackup[0][0];
            endPoint[0][1] = endPointBackup[0][1];
            endPoint[1][0] = endPointBackup[1][0];
            endPoint[1][1] = endPointBackup[1][1];

            for (i = 0; i < 2; i++)  // loop through 2 subsets
            {
#if defined(ENABLE_MODE1)
                if (g_mode_id == 1)
                {
                    CGU_Vec4ui quantized[2];

                    compress_endpoints1(endPoint[i], quantized, p);
                }
#endif
#if defined(ENABLE_MODE3)
                if (g_mode_id == 3)
                {
                    CGU_Vec4ui quantized[2];

                    compress_endpoints3(endPoint[i], quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
#endif
#if defined(ENABLE_MODE7)
                if (g_mode_id == 7)
                {
                    CGU_Vec4ui quantized[2];
                    compress_endpoints7(endPoint[i], quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
#endif
            }

            // Signed span (high - low) of each subset's quantized endpoint box.
            CGU_Vec4i span[2];
            span[0].x = endPoint[0][1].x - endPoint[0][0].x;
            span[0].y = endPoint[0][1].y - endPoint[0][0].y;
            span[0].z = endPoint[0][1].z - endPoint[0][0].z;
            span[0].w = endPoint[0][1].w - endPoint[0][0].w;
            span[1].x = endPoint[1][1].x - endPoint[1][0].x;
            span[1].y = endPoint[1][1].y - endPoint[1][0].y;
            span[1].z = endPoint[1][1].z - endPoint[1][0].z;
            span[1].w = endPoint[1][1].w - endPoint[1][0].w;

#if defined(ENABLE_MODE3)
            // Modes 1 and 3 carry no alpha, so the alpha span must not influence the projection.
            // NOTE(review): this zeroing is also needed for mode 1, but it is compiled out when
            // ENABLE_MODE1 is defined without ENABLE_MODE3 — confirm the guard is intentional.
            if (g_mode_id != 7)
            {
                span[0].w = span[1].w = 0;
            }
#endif
            CGU_INT span_norm_sqr[2];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);

            // Project each subset's anchor pixel onto its span; if it lands in the upper half,
            // flip the endpoints so the anchor's index MSB is 0 (required by the BC7 encoding).
            CGU_Vec4i diff;
            diff.x = shared_temp[threadBase + 0].pixel.x - endPoint[0][0].x;
            diff.y = shared_temp[threadBase + 0].pixel.y - endPoint[0][0].y;
            diff.z = shared_temp[threadBase + 0].pixel.z - endPoint[0][0].z;
            diff.w = shared_temp[threadBase + 0].pixel.w - endPoint[0][0].w;

            // TODO: again, this shouldn't be necessary here in error calculation
            CGU_INT dotProduct = dot(span[0], diff);
            if (span_norm_sqr[0] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[0]))
            {
                span[0].x = -span[0].x;
                span[0].y = -span[0].y;
                span[0].z = -span[0].z;
                span[0].w = -span[0].w;
                swap(endPoint[0][0], endPoint[0][1]);
            }

            // Same flip test for subset 1, whose anchor is the partition's fix-up index.
            diff.x = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.x - endPoint[1][0].x;
            diff.y = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.y - endPoint[1][0].y;
            diff.z = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.z - endPoint[1][0].z;
            diff.w = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.w - endPoint[1][0].w;

            dotProduct = dot(span[1], diff);
            if (span_norm_sqr[1] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[1]))
            {
                span[1].x = -span[1].x;
                span[1].y = -span[1].y;
                span[1].z = -span[1].z;
                span[1].w = -span[1].w;
                swap(endPoint[1][0], endPoint[1][1]);
            }

            CGU_UINT32 step_selector = 1;  // mode 1 has 3 bit index

#if defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
            if (g_mode_id != 1)
            {
                step_selector = 2;  // mode 3 7 have 2 bit index
            }
#endif

            // Quantize every pixel to its nearest palette index along the subset's span
            // and accumulate the squared reconstruction error per subset.
            CGU_UINT32 p_error[2] = {0, 0};
            for (i = 0; i < 16; i++)
            {
                CGU_UINT32 subset_index = (bits >> i) & 0x01;

                if (subset_index == 1)
                {
                    diff.x = shared_temp[threadBase + i].pixel.x - endPoint[1][0].x;
                    diff.y = shared_temp[threadBase + i].pixel.y - endPoint[1][0].y;
                    diff.z = shared_temp[threadBase + i].pixel.z - endPoint[1][0].z;
                    diff.w = shared_temp[threadBase + i].pixel.w - endPoint[1][0].w;

                    dotProduct  = dot(span[1], diff);
                    color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0)
                                      ? 0
                                      : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[1])]
                                                                         : aStep[step_selector][63]);
                }
                else
                {
                    diff.x      = shared_temp[threadBase + i].pixel.x - endPoint[0][0].x;
                    diff.y      = shared_temp[threadBase + i].pixel.y - endPoint[0][0].y;
                    diff.z      = shared_temp[threadBase + i].pixel.z - endPoint[0][0].z;
                    diff.w      = shared_temp[threadBase + i].pixel.w - endPoint[0][0].w;
                    dotProduct  = dot(span[0], diff);
                    color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0)
                                      ? 0
                                      : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[0])]
                                                                         : aStep[step_selector][63]);
                }

                // Reconstruct the pixel from the weighted endpoint blend (6-bit weights, +32 rounds).
                pixel_r = (endPoint[subset_index][0] * (64 - aWeight[step_selector][color_index]) +
                           endPoint[subset_index][1] * aWeight[step_selector][color_index] + 32U) >>
                          6;
                if (g_mode_id != 7)
                {
                    pixel_r.a = 255;  // modes 1 and 3 decode with opaque alpha
                }

                CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;
                CGU_UINT32 pixel_error = ComputeError(pixel_r, pixel_r);
                if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }

            // Keep the best p-bit combination independently per subset.
            for (i = 0; i < 2; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i]   = p_error[i];
                    final_p[i] = p;
                }
            }
        }

        // Publish this partition's best score for the reduction below.
        shared_temp[GI].error     = error[0] + error[1];
        shared_temp[GI].mode      = g_mode_id;
        shared_temp[GI].partition = partition;

        // mode 1 3 7 don't have rotation, we use rotation for p bits
        if (g_mode_id == 1)
            shared_temp[GI].rotation = (final_p[1] << 1) | final_p[0];
        else
            shared_temp[GI].rotation = (final_p[1] << 2) | final_p[0];
    }
    GroupSync();

    // Parallel min-reduction over the 64 candidates: halve the active thread count each
    // step, keeping the lower-error entry of each pair, until slot 0 holds the winner.
    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 32].error;
            shared_temp[GI].mode      = shared_temp[GI + 32].mode;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 32].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 16].error;
            shared_temp[GI].mode      = shared_temp[GI + 16].mode;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 16].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 8].error;
            shared_temp[GI].mode      = shared_temp[GI + 8].mode;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 4].error;
            shared_temp[GI].mode      = shared_temp[GI + 4].mode;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 2].error;
            shared_temp[GI].mode      = shared_temp[GI + 2].mode;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 1].error;
            shared_temp[GI].mode      = shared_temp[GI + 1].mode;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 1].rotation;
        }

        // Thread 0 commits the result: take this mode's winner only if it improves on
        // the best result from previously tried modes, otherwise pass the prior through.
        if ((g_InBuff[blockID].error > shared_temp[GI].error))
        {
            g_OutBuff1[blockID].error          = shared_temp[GI].error;
            g_OutBuff1[blockID].mode           = shared_temp[GI].mode;
            g_OutBuff1[blockID].partition      = shared_temp[GI].partition;
            g_OutBuff1[blockID].rotation       = shared_temp[GI].rotation;
            g_OutBuff1[blockID].index_selector = 0;
            g_OutBuff1[blockID].data2          = 0;
        }
        else
        {
            g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
        }
    }
#else
    GroupSync();
    if (threadInBlock < 1)
    {
        // None of modes 1/3/7 is enabled: carry over the prior results unchanged.
        g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
        g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
        g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
        g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
        g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
        g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
    }
#endif
}

//--------------------------------------------------------------------------------------
// TryMode02CS: scores BC7 modes 0 and 2 (the three-subset modes) for each 4x4 block.
// 64 threads cooperate on one block: each thread evaluates one candidate three-subset
// partition (mode 0 uses 16 partitions, mode 2 uses 64), then a log2 min-reduction
// picks the best one. The winner replaces the g_OutBuff1 entry only if it beats the
// best result carried in from g_InBuff.
//--------------------------------------------------------------------------------------
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void TryMode02CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)  // mode 0 2 have 3 subsets per block
{
    const CGU_UINT32 MAX_USED_THREAD = 64;  // threads actually used per block: one per candidate partition
    CGU_UINT32       BLOCK_IN_GROUP  = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32       blockInGroup    = GI / MAX_USED_THREAD;
    CGU_UINT32       blockID         = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32       threadBase      = blockInGroup * MAX_USED_THREAD;  // first shared_temp slot owned by this block
    CGU_UINT32       threadInBlock   = GI - threadBase;                 // 0..63 within the block's thread set

    // Texel coordinates of this block's top-left corner in the source image.
    CGU_UINT32 block_y = blockID / g_num_block_x;
    CGU_UINT32 block_x = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x  = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y  = block_y * BLOCK_SIZE_Y;

#if defined(ENABLE_MODE0) || defined(ENABLE_MODE2)
    // First 16 threads load the block's 16 texels into shared memory as 0..255 integers.
    if (threadInBlock < 16)
    {
        CGU_Vec4f px            = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px                      = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
    }
    GroupSync();

    // Seed every slot with worst-case error so unused threads lose the min-reduction.
    shared_temp[GI].error = 0xFFFFFFFF;

    // Mode 0 encodes a 4-bit partition field (16 choices); mode 2 encodes 6 bits (64).
    CGU_UINT32 num_partitions;
    if (0 == g_mode_id)
    {
        num_partitions = 16;
    }
    else
    {
        num_partitions = 64;
    }

    CGU_Vec4ui pixel_r;
    CGU_Vec4ui endPoint[3][2];  // endPoint[0..1 for subset id][0..1 for low and high in the subset]
    CGU_Vec4ui endPointBackup[3][2];
    CGU_UINT32 color_index[16];

    if (threadInBlock < num_partitions)
    {
        // Three-subset partition tables are indexed 64..127 globally; blockPartitions2
        // and candidateFixUpIndex1D expect that offset form.
        CGU_UINT32 partition = threadInBlock + 64;

        // Compute the axis-aligned bounding box (min/max endpoints) of each subset.
        endPoint[0][0]   = MAX_UINT;
        endPoint[0][1]   = MIN_UINT;
        endPoint[1][0]   = MAX_UINT;
        endPoint[1][1]   = MIN_UINT;
        endPoint[2][0]   = MAX_UINT;
        endPoint[2][1]   = MIN_UINT;
        CGU_UINT32 bits2 = blockPartitions2[partition - 64];  // 16x2-bit subset mask for this partition
        CGU_UINT32 i;
        for (i = 0; i < 16; i++)
        {
            CGU_Vec4ui pixel        = shared_temp[threadBase + i].pixel;
            CGU_UINT32 subset_index = (bits2 >> (i * 2)) & 0x03;
            if (subset_index == 2)
            {
                endPoint[2][0] = cmp_min(endPoint[2][0], pixel);
                endPoint[2][1] = cmp_max(endPoint[2][1], pixel);
            }
            else if (subset_index == 1)
            {
                endPoint[1][0] = cmp_min(endPoint[1][0], pixel);
                endPoint[1][1] = cmp_max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = cmp_min(endPoint[0][0], pixel);
                endPoint[0][1] = cmp_max(endPoint[0][1], pixel);
            }
        }

        // Keep the unquantized endpoints; each p-bit trial below re-quantizes from these.
        endPointBackup[0][0] = endPoint[0][0];
        endPointBackup[0][1] = endPoint[0][1];
        endPointBackup[1][0] = endPoint[1][0];
        endPointBackup[1][1] = endPoint[1][1];
        endPointBackup[2][0] = endPoint[2][0];
        endPointBackup[2][1] = endPoint[2][1];

        // Mode 0 has two p-bits per subset (4 combinations); mode 2 has none.
        CGU_UINT32 max_p;
        if (0 == g_mode_id)
        {
            max_p = 4;
        }
        else
        {
            max_p = 1;
        }

        CGU_UINT32 final_p[3] = {0, 0, 0};                       // best p-bit choice found per subset
        CGU_UINT32 error[3]   = {MAX_UINT, MAX_UINT, MAX_UINT};  // best error found per subset
        CGU_Vec4ui ep_quantized[2];
        for (CGU_UINT32 p = 0; p < max_p; p++)
        {
            // Restore the raw endpoints before quantizing with this p-bit combination.
            endPoint[0][0] = endPointBackup[0][0];
            endPoint[0][1] = endPointBackup[0][1];
            endPoint[1][0] = endPointBackup[1][0];
            endPoint[1][1] = endPointBackup[1][1];
            endPoint[2][0] = endPointBackup[2][0];
            endPoint[2][1] = endPointBackup[2][1];

            for (i = 0; i < 3; i++)
            {
                if (0 == g_mode_id)
                {
                    compress_endpoints0(endPoint[i], ep_quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
                else
                {
                    compress_endpoints2(endPoint[i], ep_quantized);
                }
            }

            CGU_UINT32 step_selector = 1 + (2 == g_mode_id);  // mode 0: 3-bit indices; mode 2: 2-bit indices

            // Signed span (high - low) of each subset; alpha is ignored (modes 0/2 are RGB-only).
            CGU_Vec4i span[3];
            span[0]   = cmp_castimp(endPoint[0][1] - endPoint[0][0]);
            span[1]   = cmp_castimp(endPoint[1][1] - endPoint[1][0]);
            span[2]   = cmp_castimp(endPoint[2][1] - endPoint[2][0]);
            span[0].w = span[1].w = span[2].w = 0;

            CGU_INT span_norm_sqr[3];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);
            span_norm_sqr[2] = dot(span[2], span[2]);

            // TODO: again, this shouldn't be necessary here in error calculation
            // Project each subset's anchor pixel onto its span; flip endpoints when the
            // anchor lands in the upper half so its index MSB stays 0 (BC7 requirement).
            CGU_UINT32 ci[3] = {0, candidateFixUpIndex1D[partition].x, candidateFixUpIndex1D[partition].y};
            CGU_Vec4ui diff;
            for (i = 0; i < 3; i++)
            {
                diff               = shared_temp[threadBase + ci[i]].pixel - endPoint[i][0];
                CGU_INT dotProduct = dot(span[i], diff);
                if (span_norm_sqr[i] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[i]))
                {
                    span[i] = -span[i];
                    swap(endPoint[i][0], endPoint[i][1]);
                }
            }

            CGU_UINT32 p_error[3] = {0, 0, 0};

            // Quantize every pixel to its nearest palette index along its subset's span
            // and accumulate the squared reconstruction error per subset.
            for (i = 0; i < 16; i++)
            {
                CGU_UINT32 subset_index = (bits2 >> (i * 2)) & 0x03;
                if (subset_index == 2)
                {
                    diff               = shared_temp[threadBase + i].pixel - endPoint[2][0];
                    CGU_INT dotProduct = dot(span[2], diff);
                    color_index[i]     = (span_norm_sqr[2] <= 0 || dotProduct <= 0)
                                             ? 0
                                             : ((dotProduct < span_norm_sqr[2]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[2])]
                                                                                : aStep[step_selector][63]);
                }
                else if (subset_index == 1)
                {
                    diff               = shared_temp[threadBase + i].pixel - endPoint[1][0];
                    CGU_INT dotProduct = dot(span[1], diff);
                    color_index[i]     = (span_norm_sqr[1] <= 0 || dotProduct <= 0)
                                             ? 0
                                             : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[1])]
                                                                                : aStep[step_selector][63]);
                }
                else
                {
                    diff               = shared_temp[threadBase + i].pixel - endPoint[0][0];
                    CGU_INT dotProduct = dot(span[0], diff);
                    color_index[i]     = (span_norm_sqr[0] <= 0 || dotProduct <= 0)
                                             ? 0
                                             : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[0])]
                                                                                : aStep[step_selector][63]);
                }

                // Reconstruct the pixel from the weighted endpoint blend (6-bit weights, +32 rounds).
                pixel_r = (endPoint[subset_index][0] * (64 - aWeight[step_selector][color_index[i]]) +
                           endPoint[subset_index][1] * aWeight[step_selector][color_index[i]] + 32U) >>
                          6;
                pixel_r.a = 255;  // modes 0 and 2 decode with opaque alpha

                CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;

                CGU_UINT32 pixel_error = ComputeError(pixel_r, pixel_r);

                if (subset_index == 2)
                    p_error[2] += pixel_error;
                else if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }

            // Keep the best p-bit combination independently per subset.
            for (i = 0; i < 3; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i]   = p_error[i];
                    final_p[i] = p;  // Borrow rotation for p
                }
            }
        }

        // Publish this partition's best score; rotation packs the three 2-bit p values.
        shared_temp[GI].error     = error[0] + error[1] + error[2];
        shared_temp[GI].partition = partition;
        shared_temp[GI].rotation  = (final_p[2] << 4) | (final_p[1] << 2) | final_p[0];
    }
    GroupSync();

    // Parallel min-reduction over the candidates: halve the active thread count each
    // step, keeping the lower-error entry of each pair, until slot 0 holds the winner.
    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 32].error;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 32].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 16].error;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 16].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 8].error;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 4].error;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 2].error;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 1].error;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 1].rotation;
        }

        // Thread 0 commits the result: take this mode's winner only if it improves on
        // the best result from previously tried modes, otherwise pass the prior through.
        if (g_InBuff[blockID].error > shared_temp[GI].error)
        {
            g_OutBuff1[blockID].error     = shared_temp[GI].error;
            g_OutBuff1[blockID].mode      = g_mode_id;
            g_OutBuff1[blockID].partition = shared_temp[GI].partition;
            g_OutBuff1[blockID].rotation  = shared_temp[GI].rotation;
            g_OutBuff1[blockID].data2     = 0;
        }
        else
        {
            g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
        }
    }
#endif
}

CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void EncodeBlocks(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)
{
    CMP_CONSTANT CGU_UINT32 MAX_USED_THREAD = 16;
    CGU_UINT32              BLOCK_IN_GROUP  = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32              blockInGroup    = GI / MAX_USED_THREAD;
    CGU_UINT32              blockID         = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32              threadBase      = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32              threadInBlock   = GI - threadBase;

    CGU_UINT32 block_y = blockID / g_num_block_x;
    CGU_UINT32 block_x = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x  = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y  = block_y * BLOCK_SIZE_Y;

    CGU_UINT32 use_cmp             = g_InBuff[blockID].mode & 0x10;
    CGU_UINT32 best_mode           = g_InBuff[blockID].mode & 0x07;
    CGU_UINT32 best_partition      = g_InBuff[blockID].partition;
    CGU_UINT32 best_index_selector = g_InBuff[blockID].index_selector;
    CGU_UINT32 best_rotation       = g_InBuff[blockID].rotation;

    if (threadInBlock < 16)
    {
        CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px           = clamp(px, 0.0f, 255.0f);

        CGU_Vec4ui pixel;
        pixel.r = (CGU_UINT32)px.r;
        pixel.g = (CGU_UINT32)px.g;
        pixel.b = (CGU_UINT32)px.b;
        pixel.a = (CGU_UINT32)px.a;

        if ((4 == best_mode) || (5 == best_mode))
            set_pixel_rotation(pixel, best_rotation);

        shared_temp[GI].pixel = pixel;
    }
    GroupSync();

    CGU_UINT32 bits  = blockPartitions[best_partition];
    CGU_UINT32 bits2 = blockPartitions2[best_partition - 64];

    CGU_Vec4ui ep[2];
    ep[0] = MAX_UINT;
    ep[1] = MIN_UINT;

    CGU_Vec4ui ep_quantized[2];
    CGU_Vec3ui diff3;
    CGU_Vec4ui diff4;

    CMP_UNROLL for (CGU_INT ii = 2; ii >= 0; --ii)
    {
        if (threadInBlock < 16)
        {
            CGU_Vec4ui epTemp[2];
            epTemp[0] = MAX_UINT;
            epTemp[1] = MIN_UINT;

            CGU_Vec4ui pixel = shared_temp[GI].pixel;

            CGU_UINT32 subset_index  = (bits >> threadInBlock) & 0x01;
            CGU_UINT32 subset_index2 = (bits2 >> (threadInBlock * 2)) & 0x03;
            if (0 == ii)
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (0 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
                {
                    if (0 == subset_index)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((4 == best_mode) || (5 == best_mode) || (6 == best_mode))
                {
                    epTemp[0] = epTemp[1] = pixel;
                }
            }
            else if (1 == ii)
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (1 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
                {
                    if (1 == subset_index)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
            }
            else
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (2 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
            }

            shared_temp[GI].endPoint_low  = epTemp[0];
            shared_temp[GI].endPoint_high = epTemp[1];
        }
        GroupSync();

        if (threadInBlock < 8)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
        }
        GroupSync();

        if (threadInBlock < 4)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
        }
        GroupSync();

        if (threadInBlock < 2)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
        }
        GroupSync();

        if (threadInBlock < 1)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
        }
        GroupSync();

        if (ii == (int)threadInBlock)
        {
            ep[0] = shared_temp[threadBase].endPoint_low;
            ep[1] = shared_temp[threadBase].endPoint_high;
        }
    }

    if (threadInBlock < 3)
    {
        CGU_Vec2ui P;

        if (1 == best_mode)
        {
            P = (best_rotation >> threadInBlock) & 1;
        }
        else
        {
            P = CGU_Vec2ui((best_rotation >> (threadInBlock * 2 + 0)) & 1, (best_rotation >> (threadInBlock * 2 + 1)) & 1);
        }

        if (0 == best_mode)
        {
            compress_endpoints0(ep, ep_quantized, P);
        }
        else if (1 == best_mode)
        {
            compress_endpoints1(ep, ep_quantized, P);
        }
        else if (2 == best_mode)
        {
            compress_endpoints2(ep, ep_quantized);
        }
        else if (3 == best_mode)
        {
            compress_endpoints3(ep, ep_quantized, P);
        }
        else if (4 == best_mode)
        {
            compress_endpoints4(ep, ep_quantized);
        }
        else if (5 == best_mode)
        {
            compress_endpoints5(ep, ep_quantized);
        }
        else if (6 == best_mode)
        {
            compress_endpoints6(ep, ep_quantized, P);
        }
        else  //if (7 == mode)
        {
            compress_endpoints7(ep, ep_quantized, P);
        }

        CGU_Vec4i span = cmp_castimp(ep[1] - ep[0]);

        if (best_mode < 4)
            span.w = 0;

        if ((4 == best_mode) || (5 == best_mode))
        {
            if (0 == threadInBlock)
            {
                CGU_Vec2i span_norm_sqr = CGU_Vec2i(dot(span.rgb, span.rgb), span.a * span.a);

                diff3                = shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb;
                CGU_Vec2i dotProduct = CGU_Vec2i(dot(span.rgb, diff3), span.a * (shared_temp[threadBase + 0].pixel.a - ep[0].a));
                if (span_norm_sqr.x > 0 && dotProduct.x > 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
                {
                    swap(ep[0].rgb, ep[1].rgb);
                    swap(ep_quantized[0].rgb, ep_quantized[1].rgb);
                }
                if (span_norm_sqr.y > 0 && dotProduct.y > 0 && CGU_UINT32(dotProduct.y * 63.49999) > CGU_UINT32(32 * span_norm_sqr.y))
                {
                    swap(ep[0].a, ep[1].a);
                    swap(ep_quantized[0].a, ep_quantized[1].a);
                }
            }
        }
        else  //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
        {
            CGU_INT p;
            if (0 == threadInBlock)
            {
                p = 0;
            }
            else if (1 == threadInBlock)
            {
                p = candidateFixUpIndex1D[best_partition].x;
            }
            else  //if (2 == threadInBlock)
            {
                p = candidateFixUpIndex1D[best_partition].y;
            }

            CGU_INT span_norm_sqr = dot(span, span);
            diff4                 = shared_temp[threadBase + p].pixel - ep[0];
            CGU_INT dotProduct    = dot(span, diff4);
            if (span_norm_sqr > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr))
            {
                swap(ep[0], ep[1]);
                swap(ep_quantized[0], ep_quantized[1]);
            }
        }

        shared_temp[GI].endPoint_low            = ep[0];
        shared_temp[GI].endPoint_high           = ep[1];
        shared_temp[GI].endPoint_low_quantized  = ep_quantized[0];
        shared_temp[GI].endPoint_high_quantized = ep_quantized[1];
    }
    GroupSync();

    if (threadInBlock < 16)
    {
        CGU_UINT32 color_index = 0;
        CGU_UINT32 alpha_index = 0;

        CGU_Vec4ui epTemp[2];

        CGU_Vec2ui indexPrec;
        if ((0 == best_mode) || (1 == best_mode))
        {
            indexPrec = 1;
        }
        else if (6 == best_mode)
        {
            indexPrec = 0;
        }
        else if (4 == best_mode)
        {
            if (0 == best_index_selector)
            {
                indexPrec = CGU_Vec2ui(2, 1);
            }
            else
            {
                indexPrec = CGU_Vec2ui(1, 2);
            }
        }
        else
        {
            indexPrec = 2;
        }

        CGU_INT subset_index;
        if ((0 == best_mode) || (2 == best_mode))
        {
            subset_index = (bits2 >> (threadInBlock * 2)) & 0x03;
        }
        else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
        {
            subset_index = (bits >> threadInBlock) & 0x01;
        }
        else
        {
            subset_index = 0;
        }

        epTemp[0] = shared_temp[threadBase + subset_index].endPoint_low;
        epTemp[1] = shared_temp[threadBase + subset_index].endPoint_high;

        CGU_Vec4i span = cmp_castimp(epTemp[1] - epTemp[0]);
        if (best_mode < 4)
        {
            span.w = 0;
        }

        if ((4 == best_mode) || (5 == best_mode))
        {
            CGU_Vec2i span_norm_sqr;
            span_norm_sqr.x    = dot(span.rgb, span.rgb);
            span_norm_sqr.y    = span.a * span.a;
            diff3              = shared_temp[threadBase + threadInBlock].pixel.rgb - epTemp[0].rgb;
            CGU_INT dotProduct = dot(span.rgb, diff3);
            color_index =
                (span_norm_sqr.x <= 0 || dotProduct <= 0)
                    ? 0
                    : ((dotProduct < span_norm_sqr.x) ? aStep[indexPrec.x][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr.x)] : aStep[indexPrec.x][63]);

            CGU_UINT32 diffa = shared_temp[threadBase + threadInBlock].pixel.a - epTemp[0].a;
            dotProduct       = dot(span.a, diffa);
            alpha_index =
                (span_norm_sqr.y <= 0 || dotProduct <= 0)
                    ? 0
                    : ((dotProduct < span_norm_sqr.y) ? aStep[indexPrec.y][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr.y)] : aStep[indexPrec.y][63]);

            if (best_index_selector)
            {
                swap(color_index, alpha_index);
            }
        }
        else
        {
            CGU_INT span_norm_sqr = dot(span, span);
            diff4                 = shared_temp[threadBase + threadInBlock].pixel - epTemp[0];
            CGU_INT dotProduct    = dot(span, diff4);
            color_index           = (span_norm_sqr <= 0 || dotProduct <= 0)
                                        ? 0
                                        : ((dotProduct < span_norm_sqr) ? aStep[indexPrec.x][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr)] : aStep[indexPrec.x][63]);
        }

        shared_temp[GI].error = color_index;
        shared_temp[GI].mode  = alpha_index;
    }
    GroupSync();

    if (0 == threadInBlock)
    {
        CGU_Vec4ui blockRed  = {0x001fffc0, 0xfffe0000, 0x00000001, 0x00000000};
        CGU_Vec4ui blockBlue = {0x00000040, 0xfffffff8, 0x00000001, 0x00000000};

        CGU_Vec4ui block = {0, 0, 0, 0};

        switch (best_mode)
        {
        case 0:
            block_package0(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 1:
            block_package1(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 2:
            block_package2(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 3:
            block_package3(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 4:
            block_package4(block, best_rotation, best_index_selector, threadBase);
            //block = blockRed;
            break;
        case 5:
            block_package5(block, best_rotation, threadBase);
            //block = blockRed;
            break;
        case 6:
            if (use_cmp)
            {
                block = g_InBuff[blockID].data2;
                //block = blockBlue;
            }
            else
            {
                block_package6(block, threadBase);
                //block = blockRed;
            }
            break;
        case 7:
            block_package7(block, best_partition, threadBase);
            //block = blockRed;
            break;
        default:  // error!
            block = blockRed;
            break;
        }

        g_OutBuff[blockID] = block;
    }
}

//=================================================
// This is a prototype API interface to run on CPU
// move to GPU when completed
//=================================================
// Compress one 4x4 block of RGBA texels to a 128-bit BC7 block.
//
// image_src : 16 RGBA texels. NOTE(review): components are assigned straight
//             into unsigned integers below (truncation), so the inputs are
//             assumed to already be quantized to the 0..255 range — confirm
//             with callers.
// fquality  : encoder quality hint; currently unused by this prototype.
//
// Returns the compressed block as four 32-bit words. Returns {0,0,0,0} when
// building for HLSL or when no ENABLE_CMP_MODE* path is compiled in.
//
// When several ENABLE_CMP_MODE* paths are enabled at once, each later block
// overwrites 'cmp', so the last enabled mode's encoding wins.
CMP_STATIC CGU_Vec4ui CompressBlockBC7_CMPMSC(CMP_IN CGU_Vec4f image_src[16], CMP_IN CGU_FLOAT fquality)
{
    CMP_UNUSED(fquality);

    CGU_Vec4ui cmp = {0, 0, 0, 0};

#ifndef ASPM_HLSL
#ifdef SIMULATE_GPU
    // Run the full GPU shader path on the CPU and take the first block's result.
    HLSLHost(image_src);
    cmp = g_OutBuff[0];
#else
    CGU_Vec4ui image_srcui[16];
    // Transfer local pixel data over to shared global (float -> uint truncation).
    for (CGU_INT ii = 0; ii < 16; ii++)
    {
        image_srcui[ii].x = image_src[ii].x;
        image_srcui[ii].y = image_src[ii].y;
        image_srcui[ii].z = image_src[ii].z;
        image_srcui[ii].w = image_src[ii].w;
    }

#if defined(ENABLE_CMP_MODE6)
    CGU_Vec4ui epo_code_out[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
    CGU_UINT32 best_index_out[16];

    // Fast encode of block: derive mode-6 endpoints and per-texel indices.
    // (Return value is the encoding error; it is not needed here, so the
    // previous dead stores into 'besterr'/'err' have been removed.)
    cmp_GetIndexedEndPoints(epo_code_out,
                            best_index_out,
                            image_srcui,
                            15,  // numEntries 0..15 (Note this function is changed from using 16)
                            0xffffffff);
    CGU_UINT32 index_packed_out[2] = {0, 0};
    cmp_pack4bitindex32(index_packed_out, best_index_out);

#ifdef ENABLE_CMP_REFINE_MODE6_API
    // Refine endpoints and indices for better quality, then repack the indices.
    cmp_mode6_optimize_IndexAndEndPoints(epo_code_out,
                                         best_index_out,
                                         image_srcui,                     // using shared_temp[].pixel with 0 thread offset
                                         16,                              // numEntries
                                         g_modesettings[6].clusters,      // 16,
                                         g_modesettings[6].bits,          // 58,
                                         g_modesettings[6].channels3or4,  // 4,
                                         0.1f);
    cmp_pack4bitindex32(index_packed_out, best_index_out);
#endif

    // Encode the mode-6 results into the output block.
    CGU_UINT32 cmp_out6[4] = {0, 0, 0, 0};
    cmp_encode_mode6(cmp_out6, epo_code_out, index_packed_out);
    cmp.x = cmp_out6[0];
    cmp.y = cmp_out6[1];
    cmp.z = cmp_out6[2];
    cmp.w = cmp_out6[3];
#endif

#if defined(ENABLE_CMP_MODE4) || defined(ENABLE_CMP_MODE5)
    {
        // Mode 4/5 path (overwrites any mode-6 result above).
        CGU_UINT32 cmp_out[4] = {0, 0, 0, 0};
        Compress_mode45(cmp_out, 4, image_srcui);
        cmp.x = cmp_out[0];
        cmp.y = cmp_out[1];
        cmp.z = cmp_out[2];
        cmp.w = cmp_out[3];
    }
#endif

#if defined(ENABLE_CMP_MODE1)
    {
        // Mode 1 path (overwrites any earlier result). cmp_out1[4] receives
        // extra data from cmp_process_mode and is intentionally not copied.
        CGU_UINT32 cmp_out1[5] = {0, 0, 0, 0, 0};
        cmp_process_mode(cmp_out1, image_srcui, 1);
        cmp.x = cmp_out1[0];
        cmp.y = cmp_out1[1];
        cmp.z = cmp_out1[2];
        cmp.w = cmp_out1[3];
    }
#endif

#endif  // SIMULATE_GPU
#endif  // Not HLSL

    return cmp;
}
